Seawater samples were collected for (1) microbial metagenomics and (2) physico-chemical data (temperature, salinity, and particulate/dissolved nutrient concentrations) from 48 offshore reefs across the length of the GBR, within the Great Barrier Reef Microbial Genomics Database (GBR-MGD) initiative by Australia’s Integrated Marine Observing System (IMOS). This sampling was done alongside ongoing in situ health surveys by the Australian Institute of Marine Science Long-Term Monitoring Program (AIMS-LTMP).
The code below was used to create a map showing the 48 IMOS GBR-MGD sites, by combining these two tutorials: 1. https://open-aims.github.io/gisaimsr/articles/examples.html 2. https://r-spatial.org/r/2018/10/25/ggplot2-sf-2.html
# Importing the coordinates
map_coords <- read.csv("/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Data_analysis/Seawater/testing_Tom_Jenkins_script/all_IMOS-MGD_seawater_subset/Metadata_files/MARKO_for_eReefs_Lats_Longs.csv")
# View(map_coords)
# Now converting from data frame into sf format
map_coords <- st_as_sf(map_coords,
coords = c("lon", "lat"),
remove = FALSE,
crs = 4283, # this is the reference code for the CRS system GDA94, used by dataaimsr & gisaimsr R packages
agr = "constant")
# I will now add the info on Sampling trip - this will be needed when plotting
# This is the final metadata file, with average values of env. measurements (averaged per Reef site)
map_reef_names_and_trip <- read.csv(file = "/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Paper_drafts/the_analysis_is_finalised/Code/input_files/metadata_with_reef_names_maps.csv") %>%
dplyr::select(REEF_NAME, Sampling_trip)
### 2 ### Renaming the sampling trips to include dates, and make sure they are ordered alphabetically
# First trip
map_reef_names_and_trip$Sampling_trip <- gsub("First", # String to search for
"Trip_01_Nov-Dec_2019", # Replace with this
as.character(map_reef_names_and_trip$Sampling_trip)) # Column to search in
# Second trip
map_reef_names_and_trip$Sampling_trip <- gsub("Second", # String to search for
"Trip_02_January_2020", # Replace with this
as.character(map_reef_names_and_trip$Sampling_trip)) # Column to search in
# Third trip
map_reef_names_and_trip$Sampling_trip <- gsub("Third", # String to search for
"Trip_03_February_2020", # Replace with this
as.character(map_reef_names_and_trip$Sampling_trip)) # Column to search in
# Fourth trip
map_reef_names_and_trip$Sampling_trip <- gsub("Fourth", # String to search for
"Trip_04_July_2020", # Replace with this
as.character(map_reef_names_and_trip$Sampling_trip)) # Column to search in
# Merging with 'Sampling trip' info
map_coords <- left_join(map_coords, map_reef_names_and_trip, by = c("name" = "REEF_NAME"))
#########################
# Now plotting
# And now defining colors for the map
cols_map <- c("tomato3", # Enclosed Coastal
"salmon3", # Macro Tidal Enclosed Coastal
"pink3", # Macro Tidal Open Coastal
"peachpuff", # Midshelf
"lightsteelblue", # Offshore
"lightcoral") # Open coastal
# Plotting without the mainland - otherwise I am just losing precious space
gbr_no_mainland <- gbr_feat %>%
dplyr::filter(FEAT_NAME != "Mainland")
# ------------------------------------------------ #
# Overlaying IMOS-MGD sites - only as points first #
# ------------------------------------------------ #
col.per.trip <- factor(map_coords$Sampling_trip, levels = c("Trip_01_Nov-Dec_2019",
"Trip_02_January_2020",
"Trip_03_February_2020",
"Trip_04_July_2020"))
colors <- c("indianred", # Sampling trip 1
"indianred4", # Sampling trip 2
"red3", # Sampling trip 3
"slateblue") # Sampling trip 4
names(colors) <- c("Trip_01_Nov-Dec_2019",
"Trip_02_January_2020",
"Trip_03_February_2020",
"Trip_04_July_2020")
# Importing city coordinates
oz_cities <- read.csv("/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Paper_drafts/the_analysis_is_finalised/Code/input_files/oz_cities.csv")
# Plot
IMOS_MGD_dots_trip = ggplot(data = gbr_feat) + # gbr_feat is a crs object from the AIMS GIS packages
# geom_sf(data = gbr_bounds, fill = "darkred", colour = NA) + # Coloring the boundaries of the GBR MP
# Include the region info too (the 3 lines below)
# geom_sf(data = nrm_regions,
# mapping = aes(fill = NAME), lwd = 0.01) +
# scale_fill_brewer(name = "Region", palette = "Set3") +
# geom_sf(data = wbodies,
# mapping = aes(fill = MarineWate), # Commenting out as I don't need water bodies in color here
# lwd = 0.01) +
geom_sf(data = gbr_feat,
lwd = 0.01,
fill = "seashell2",
colour = NA
) +
##############################################################################################
# Adding our sites as dots
##############################################################################################
geom_sf(data = map_coords, # Needs to be a data frame, requires 'geometry'
aes(color = Sampling_trip), # Coloring sites per sampling trip
# alpha = 0.6, # This is to ensure
show.legend = "point") +
coord_sf(xlim = c(142, 154), ylim = c(-10, -27)) +
geom_text_repel(data = oz_cities, aes(x = Longitude, y = Latitude, label = Town), # repel to make sure the names do not overlap
fontface = "bold", # to have the reef names in bold
size=3.2,
col = 'black',
nudge_x = c(-0.5, # Townsville
-0.9, # Brisbane
-0.5, # Cairns
-0.8, # Cooktown
-0.5, # Mackay
-0.7), # Bundaberg
nudge_y = c(-0.5, # Townsville
0.5, # Brisbane
-0.5, # Cairns
-0.5, # Cooktown
-0.5, # Mackay
-0.5), # Bundaberg
)+
scale_color_manual(name = "Dates of Sampling Transects", values=colors)+
theme_classic() +
theme(panel.background = element_rect(fill = "lightblue3",
colour = "lightblue3",
size = 0.5, linetype = "solid")) +
labs(x = "Longitude",
y = "Latitude",
title = "IMOS-MGD",
subtitle = "Microbial Genomics Database sites") +
# scale_fill_manual(name = "Type of Water Body", values = cols_map) +
theme(legend.direction = "vertical", legend.box = "vertical")
IMOS_MGD_dots_trip
Field sampling design for the GBR-MGD (Great Barrier Reef Microbial Genomics Database) dataset. (above) Seawater was collected from 48 offshore GBR reef sites for microbial community metagenomic sequencing and water chemistry analysis over four trips between November 2019 and July 2020. Reef sites are coloured in red or blue tones to denote trips that occurred during the austral summer (wet season) or austral winter (dry season), respectively. (bellow) A more detailed map showing the name of each reef site, and their membership to either offshore (41 reefs) or mid-shelf (7 reefs) waters. No inshore sites were sampled.
# Plot
IMOS_MGD_dots_trip_black = ggplot(data = gbr_feat) + # gbr_feat is a crs object from the AIMS GIS packages
# geom_sf(data = gbr_bounds, fill = "darkred", colour = NA) + # Coloring the boundaries of the GBR MP
# Include the region info too (the 3 lines below)
# geom_sf(data = nrm_regions,
# mapping = aes(fill = NAME), lwd = 0.01) +
# scale_fill_brewer(name = "Region", palette = "Set3") +
# geom_sf(data = wbodies,
# mapping = aes(fill = MarineWate), # Commenting out as I don't need water bodies in color here
# lwd = 0.01) +
geom_sf(data = gbr_feat,
lwd = 0.01,
fill = "black",
colour = NA
) +
##############################################################################################
# Adding our sites as dots
##############################################################################################
geom_sf(data = map_coords, # Needs to be a data frame, requires 'geometry'
aes(color = Sampling_trip), # Coloring sites per sampling trip
# alpha = 0.6, # This is to ensure
show.legend = "point") +
coord_sf(xlim = c(142, 154), ylim = c(-10, -27)) +
geom_text_repel(data = oz_cities, aes(x = Longitude, y = Latitude, label = Town), # repel to make sure the names do not overlap
fontface = "bold", # to have the reef names in bold
size=3.2,
col = 'black',
nudge_x = c(-0.5, # Townsville
-0.9, # Brisbane
-0.5, # Cairns
-0.8, # Cooktown
-0.5, # Mackay
-0.7), # Bundaberg
nudge_y = c(-0.5, # Townsville
0.5, # Brisbane
-0.5, # Cairns
-0.5, # Cooktown
-0.5, # Mackay
-0.5), # Bundaberg
)+
scale_color_manual(name = "Dates of Sampling Transects", values=colors)+
theme_classic() +
theme(panel.background = element_rect(fill = "lightblue3",
colour = "lightblue3",
size = 0.5,
linetype = "solid"
)) +
labs(x = "Longitude",
y = "Latitude",
title = "IMOS-MGD",
subtitle = "Microbial Genomics Database sites") +
# scale_fill_manual(name = "Type of Water Body", values = cols_map) +
theme(legend.direction = "vertical", legend.box = "vertical")
IMOS_MGD_dots_trip_black
Field sampling design for the GBR-MGD (Great Barrier Reef Microbial Genomics Database) dataset. (above) Seawater was collected from 48 offshore GBR reef sites for microbial community metagenomic sequencing and water chemistry analysis over four trips between November 2019 and July 2020. Reef sites are coloured in red or blue tones to denote trips that occurred during the austral summer (wet season) or austral winter (dry season), respectively. (bellow) A more detailed map showing the name of each reef site, and their membership to either offshore (41 reefs) or mid-shelf (7 reefs) waters. No inshore sites were sampled.
To show reefs in more detail, we also plot a close-up of sites within each trip.
# But need to split map_coords file per trip
map_coords_trip1 <- filter(map_coords, Sampling_trip=="Trip_01_Nov-Dec_2019")
map_coords_trip2 <- filter(map_coords, Sampling_trip=="Trip_02_January_2020")
map_coords_trip3 <- filter(map_coords, Sampling_trip=="Trip_03_February_2020")
map_coords_trip4 <- filter(map_coords, Sampling_trip=="Trip_04_July_2020")
IMOS_MGD_trip1 = ggplot(data = gbr_feat) + # gbr_feat is a crs object from the AIMS GIS packages
# geom_sf(data = gbr_bounds, fill = "darkred", colour = NA) + # Coloring the boundaries of the GBR MP
geom_sf() +
# geom_sf(data = wbodies,
# mapping = aes(fill = MarineWate), # Commenting out as I don't need water bodies in color here
# lwd = 0.01) +
geom_sf(data = gbr_feat,
lwd = 0.01,
fill = "seashell2",
colour = NA
) +
##############################################################################################
# Adding our sites as dots
##############################################################################################
geom_sf(data = map_coords_trip1, # Needs to be a data frame, requires 'geometry'
aes(color = Sampling_trip), # Coloring sites per sampling trip
show.legend = "point") +
geom_text_repel(data = map_coords_trip1, aes(x = lon, y = lat, label = name, colour = Sampling_trip), # repel to make sure the names do not overlap
fontface = "bold", # to have the reef names in bold
size=3.2,
segment.color = "black",
segment.alpha = 0.6,
segment.size = 0.1,
nudge_x = c(2.2, # MCSWEENEY REEF
2.4, # MONSOON REEF, - sign means it will move to the left
1.2, # 11-049
1.2, # 11-162
0.9, # MANTIS REEF
1.6, # LAGOON REEF
0.4, # DAVIE REEF
-0.2, # CORBETT REEF
0.4, # 13-124
-0.1, # SANBANK 1 REEF
0.5), # St Crispin
# This should be 48 times, for our 48 sites
nudge_y = c(0.2, # MCSWEENEY REEF
0.1, # MONSOON REEF
0.1, # 11-049, - sign means it will go down
0.1, # 11-162
0.2, # MANTIS REEF
-0.3, # LAGOON REEF
0.2, # DAVIE REEF
-0.8, # CORBETT REEF
0.3, # 13-124
-1.6, # SANDBANK 1 REEF
0.5), ) + # St Crispin
coord_sf(xlim = c(143, 147), ylim = c(-11, -16.5)) +
scale_color_manual(name = "Sampling trip", values=c("indianred")) + # color I am using for Sampling trip 1
theme_classic() +
theme(panel.background = element_rect(fill = "lightblue3",
colour = "lightblue3",
size = 0.5, linetype = "solid")) +
labs(x = "Longitude",
y = "Latitude",
title = "IMOS Microbial Genomics Database sites",
subtitle = "Trip 1 (Nov-Dec 2019)")
# scale_fill_manual(name = "Type of Water Body", values = cols_map) +
# theme(legend.direction = "vertical", legend.box = "vertical")
IMOS_MGD_trip1
IMOS_MGD_trip2 = ggplot(data = gbr_feat) + # gbr_feat is a crs object from the AIMS GIS packages
# geom_sf(data = gbr_bounds, fill = "darkred", colour = NA) + # Coloring the boundaries of the GBR MP
geom_sf() +
# geom_sf(data = wbodies,
# mapping = aes(fill = MarineWate), # Commenting out as I don't need water bodies in color here
# lwd = 0.01) +
geom_sf(data = gbr_feat,
lwd = 0.01,
fill = "seashell2",
colour = NA
) +
##############################################################################################
# Adding our sites as dots
##############################################################################################
geom_sf(data = map_coords_trip2, # Needs to be a data frame, requires 'geometry'
aes(color = Sampling_trip), # Coloring sites per sampling trip
show.legend = "point") +
geom_text_repel(data = map_coords_trip2, aes(x = lon, y = lat, label = name, colour = Sampling_trip), # repel to make sure the names do not overlap
fontface = "bold", # to have the reef names in bold
size=3.2,
segment.color = "black",
segment.alpha = 0.6,
segment.size = 0.1,
nudge_x = c(-0.1, # FAIRFAX REEF
-0.4, # HOSKYN REEF
0.3, # BOULT REEF
0.3, # MASTHEAD REEF
-0.2, # ERSKINE REEF
0.4, # BROOMFIELD REEF
0.1, # 21-550
-0.5, # 22-084
0.4, # CHINAMAN REEF
-0.1, # 21-580
0.2, # SMALL LAGOON REEF
-0.3), # NORTH REEF
# This should be 48 times, for our 48 sites
nudge_y = c(-0.1, # FAIRFAX REEF
-0.1, # HOSKYN REEF
0.2, # BOULT REEF
-0.1, # MASTHEAD REEF
0.2, # ERSKINE REEF
0.2, # BROOMFIELD REEF
0.2, # 21-550
-0.3, # 22-084
0.2, # CHINAMAN REEF
-0.6, # 21-580
0.4, # SMALL LAGOON REEF
0.1), # NORTH REEF
) + # SANDBANK 1 REEF
coord_sf(xlim = c(151, 153), ylim = c(-21.5, -24)) +
scale_color_manual(name = "Sampling trip", values=c("indianred4")) + # color I am using for Sampling trip 1
theme_classic() +
theme(panel.background = element_rect(fill = "lightblue3",
colour = "lightblue3",
size = 0.5,
linetype = "solid")) +
labs(x = "Longitude",
y = "Latitude",
title = "IMOS Microbial Genomics Database sites",
subtitle = "Trip 2 (January 2020)")
# scale_fill_manual(name = "Type of Water Body", values = cols_map) +
# theme(legend.direction = "vertical", legend.box = "vertical")
IMOS_MGD_trip2
IMOS_MGD_trip3 = ggplot(data = gbr_feat) + # gbr_feat is a crs object from the AIMS GIS packages
# geom_sf(data = gbr_bounds, fill = "darkred", colour = NA) + # Coloring the boundaries of the GBR MP
geom_sf() +
# geom_sf(data = wbodies,
# mapping = aes(fill = MarineWate), # Commenting out as I don't need water bodies in color here
# lwd = 0.01) +
geom_sf(data = gbr_feat,
lwd = 0.01,
fill = "seashell2",
colour = NA
) +
##############################################################################################
# Adding our sites as dots
##############################################################################################
geom_sf(data = map_coords_trip3, # Needs to be a data frame, requires 'geometry'
aes(color = Sampling_trip), # Coloring sites per sampling trip
show.legend = "point") +
geom_text_repel(data = map_coords_trip3, aes(x = lon, y = lat, label = name, colour = Sampling_trip), # repel to make sure the names do not overlap
fontface = "bold", # to have the reef names in bold
size=3.2,
segment.color = "black",
segment.alpha = 0.6,
segment.size = 0.1,
nudge_x = c(#0.3, # ST CRISPIN
0.3, # AGINCOURT1 REEF
0.3, # HASTINGS REEF
0.3, # ARLINGTON REEF
0.4, # THETFORD REEF
0.4, # MOORE REEF
0.3, # HEDLEY REEF
0.3, # MCCULLOCH REEF
0.4, # PEART REEF
0.4, # FEATHER REEF
0.1, # FARQUAHARSON REEF
0.3), # TAYLOR REEF
# This should be 48 times, for our 48 sites
nudge_y = c(#-0.1, # ST CRISPIN
0.2, # AGINCOURT1 REEF
0.2, # HASTINGS REEF
0.1, # ARLINGTON REEF
0.2, # THETFORD REEF
0.1, # MOORE REEF
0.1, # HEDLEY REEF
0.2, # MCCULLOCH REEF
0.1, # PEART REEF
-0.1, # FEATHER REEF
0.2, # FARQUAHARSON REEF
-0.1), # TAYLOR REEF
) +
coord_sf(xlim = c(145.4, 147), ylim = c(-15.8, -18)) +
scale_color_manual(name = "Sampling trip", values=c("red"))+
theme_classic() +
theme(panel.background = element_rect(fill = "lightblue3",
colour = "lightblue3",
size = 0.5,
linetype = "solid")) +
labs(x = "Longitude",
y = "Latitude",
title = "IMOS Microbial Genomics Database sites",
subtitle = "Trip 3 (February 2020)") +
# scale_fill_manual(name = "Type of Water Body", values = cols_map) +
theme(legend.direction = "vertical", legend.box = "vertical")
IMOS_MGD_trip3
IMOS_MGD_trip4 = ggplot(data = gbr_feat) + # gbr_feat is a crs object from the AIMS GIS packages
# geom_sf(data = gbr_bounds, fill = "darkred", colour = NA) + # Coloring the boundaries of the GBR MP
geom_sf() +
# geom_sf(data = wbodies,
# mapping = aes(fill = MarineWate), # Commenting out as I don't need water bodies in color here
# lwd = 0.01) +
geom_sf(data = gbr_feat,
lwd = 0.01,
fill = "seashell2",
colour = NA
) +
##############################################################################################
# Adding our sites as dots
##############################################################################################
geom_sf(data = map_coords_trip4, # Needs to be a data frame, requires 'geometry'
aes(color = Sampling_trip), # Coloring sites per sampling trip
show.legend = "point") +
geom_text_repel(data = map_coords_trip4, aes(x = lon, y = lat, label = name, colour = Sampling_trip), # repel to make sure the names do not overlap
fontface = "bold", # to have the reef names in bold
size=3.2,
segment.color = "black",
segment.alpha = 0.6,
segment.size = 0.1,
nudge_x = c(-0.2, # LITTLE KELSO REEF
-0.2, # KELSO REEF
0.5, # ROXBURGH REEF
0.2, # FORE&AFT REEF
0.2, # RIB REEF
-0.1, # JOHN BREWER REEF
0.1, # MYRMIDON REEF
-0.3, # CHICKEN REEF
0.3, # KNIFE REEF
0.2, # FORK REEF
0.2, # LYNCHS REEF
-0.1, # CENTIPEDE REEF
-0.1, # GRUB REEF
-0.2), # HELIX REEF
# This should be 48 times, for our 48 sites
nudge_y = c(-0.2, # LITTLE KELSO REEF
-0.1, # KELSO REEF
0.1, # ROXBURGH REEF
0, # FORE&AFT REEF
0.1, # RIB REEF
-0.2, # JOHN BREWER REEF
0.1, # MYRMIDON REEF
-0.3, # CHICKEN REEF
0.1, # KNIFE REEF
0, # FORK REEF
-0.2, # LYNCHS REEF
-0.1, # CENTIPEDE REEF
-0.1, # GRUB REEF
-0.2), # HELIX REEF
) +
coord_sf(xlim = c(146.8, 148), ylim = c(-18.1, -19)) +
scale_color_manual(name = "Sampling trip", values=c("slateblue"))+
theme_classic() +
theme(panel.background = element_rect(fill = "lightblue3",
colour = "lightblue3",
size = 0.5,
linetype = "solid")) +
labs(x = "Longitude",
y = "Latitude",
title = "IMOS Microbial Genomics Database sites",
subtitle = "Trip 4 (July 2020)") +
# scale_fill_manual(name = "Type of Water Body", values = cols_map) +
theme(legend.direction = "vertical", legend.box = "vertical")
IMOS_MGD_trip4
plot_grid(IMOS_MGD_trip1, IMOS_MGD_trip2, IMOS_MGD_trip3, IMOS_MGD_trip4,
nrow = 1,
ncol = 4)
wrangling <- read.csv("/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Paper_drafts/the_analysis_is_finalised/Code/input_files/metadata_wrangling.csv")
WQ_methods <- read.csv("/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Paper_drafts/the_analysis_is_finalised/Code/input_files/WQ_IDs_IMOS-MGD_only.csv")
# And this is one of the WQ spreadsheets - link between WQ IDs and our Reef names
# Selecting only the columns of interest
WQ_methods <- dplyr::select(WQ_methods, one_of(c("REEF_NAME",
"WQ_Station_Name",
"Collection_method", # diving or from boat
"Sample_collection_start")))#,
# We decided not to include the metrics bellow
# "Swell_direction",
# "Swell_height",
# "Wind_direction",
#"Wind_speed" )))
# Additional data from the LTMP trips
LTMP <- read.csv("/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Paper_drafts/the_analysis_is_finalised/Code/input_files/metadata_IMOS_from_Mike.csv") %>%
dplyr::select("REEF_NAME", "Sampling_trip", "GBR_sector", "SAMPLE_DATE", "Lat", "Long")
### 2 ### Renaming the sampling trips to include dates, and make sure they are ordered alphabetically
# First trip
LTMP$Sampling_trip <- gsub("First", # String to search for
"Trip_01_Nov-Dec_2019", # Replace with this
as.character(LTMP$Sampling_trip)) # Column to search in
# Second trip
LTMP$Sampling_trip <- gsub("Second", # String to search for
"Trip_02_January_2020", # Replace with this
as.character(LTMP$Sampling_trip)) # Column to search in
# Third trip
LTMP$Sampling_trip <- gsub("Third", # String to search for
"Trip_03_February_2020", # Replace with this
as.character(LTMP$Sampling_trip)) # Column to search in
# Fourth trip
LTMP$Sampling_trip <- gsub("Fourth", # String to search for
"Trip_04_July_2020", # Replace with this
as.character(LTMP$Sampling_trip)) # Column to search in
# Joining - first step
metadata <- left_join(wrangling, LTMP)
# In this step I added the Sample IDs to the LTMP data
metadata <- left_join(metadata, WQ_methods)
# And in here the info from the WQ team
# importing the actual water chemistry measurements
WQ_Result_Report <- read.csv("/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Paper_drafts/the_analysis_is_finalised/Code/input_files/MBM_Result_Report_R_Dec_2022.csv")
# Removing Temperature and Salinity for now - too many missing values
WQ_Result_Report <- dplyr::select(WQ_Result_Report, one_of(c("WQ_Station_Name",
"DEPTH",
"Chlorophyll_a_.µg.L.",
"Phaeophytin_a_.µg.L.",
"PN_.µM.",
"POC_.µM.",
"PP_.µM.",
"DOC_.µM.",
"PO4_.µM.",
"NH4_.µM.",
"NO2_.µM.",
"NO3_.µM.",
"Si_.µM.",
"TDN_.µM.",
"TDP_.µM.",
"TSS_.mg.L.")))
# "Salinity",
# "Temperature.C..")))
# Now adding the Reef_name info
wq.all.reps.for.pca <- left_join(WQ_methods, WQ_Result_Report)
# I also need the info on Sampling trips - to color the groups on the PCA. I will also add the data from the research vessel at this stage - Temperature, Salinity, Turbidity, Fluorescence
reefs_trips <- read.csv("/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Paper_drafts/the_analysis_is_finalised/Code/input_files/metadata_with_reef_names.csv")
# But I only want reef names and their corresponding trips for now!
reefs_trips <- reefs_trips[,c(1,3)]
### 2 ### Renaming the sampling trips to include dates, and make sure they are ordered alphabetically
# First trip
reefs_trips$Sampling_trip <- gsub("First", # String to search for
"Trip_01_Nov-Dec_2019", # Replace with this
as.character(reefs_trips$Sampling_trip)) # Column to search in
# Second trip
reefs_trips$Sampling_trip <- gsub("Second", # String to search for
"Trip_02_January_2020", # Replace with this
as.character(reefs_trips$Sampling_trip)) # Column to search in
# Third trip
reefs_trips$Sampling_trip <- gsub("Third", # String to search for
"Trip_03_February_2020", # Replace with this
as.character(reefs_trips$Sampling_trip)) # Column to search in
# Fourth trip
reefs_trips$Sampling_trip <- gsub("Fourth", # String to search for
"Trip_04_July_2020", # Replace with this
as.character(reefs_trips$Sampling_trip)) # Column to search in
# Now adding the metadata from the RV
vessel_metadata <- read.csv("/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Paper_drafts/the_analysis_is_finalised/Code/input_files/GBR-Genomics-Database_Seawater-Illumina-Reads.csv")
# Keeping only: Temperature, Salinity, Turbidity, Fluorescence
vessel_metadata <- dplyr::select(vessel_metadata, one_of(c("REEF_NAME",
"SEAWATER_TEMPERATURE_2.5m_RV",
"SALINITY_2.5m_RV",
# "TURBIDITY_2.5m_RV",
"FLUORESCENCE_2.5m_RV")))
# Joining now
reefs_trips <- left_join(reefs_trips,
vessel_metadata)
wq.all.reps.for.pca <- left_join(wq.all.reps.for.pca,
reefs_trips)
PCA was applied on a physico-chemical dataset containing 17 variables, including: 1. 14 water chemistry variables: ammonia (NH4), nitrite (NO2), nitrate (NO3), total dissolved nitrogen (TDN), phosphate (PO4), total dissolved phosphorus (TDP), dissolved organic carbon (DOC), silicate (Si), total suspended solids (TSS), chlorophyll a (Chl-a), phaeophytin a (Phaeo), particulate organic carbon (POC), particulate nitrogen (PN), and particulate phosphorus (PP). For each of these 14 water chemistry variables, triplicate 5 L seawater samples were collected using Niskin bottles for analysis of water chemistry variables, at each of the 48 reefs. 2. temperature, fluorescence, and salinity measurements from the underway sampling systems on the RV Solander and RV Cape Ferguson, with intake depths for underway systems were 1.9 m (RV Cape Ferguson) and 2.5 m (RV Solander). For these three measurements, one value per reef site was recorded.
The mixOmics function tune.pca() calculates the cumulative proportion of explained variance for a large number of principal components (here we set ncomp = 10). A screeplot of the proportion of explained variance relative to the total amount of variance in the data for each principal component is output.
# In PCA, we first count the number of missing values, as this will tell us whether PCA will be solved using SVD (no missing values) or iterative NIPALS (with missing values) internally in the mixOmics function pca().
sum(is.na(wq.all.reps.for.pca[,c(6:14, 16:23)]))
## [1] 17
# Number of NAs
## [1] 17
# Since we have some missing values, the iterative NIPALS will be called inside pca()
tune.pca.WQ <- tune.pca(wq.all.reps.for.pca[,c(6:14, 16:23)], ncomp = 10, scale = TRUE)
plot(tune.pca.WQ)
Screeplot from the PCA performed on the IMOS GBR-MGD physico-chemical data: Amount of explained variance for each principal component is shown. From the numerical output (shown bellow in tabular format), we observe that the first two principal components explain 60.31% of the total variance. The rule of thumb for choosing the number of PCA components is not so much to set a hard threshold based on the cumulative proportion of explained variance (as this is data-dependent), but to observe when a drop, or elbow, appears on the screeplot. The elbow indicates that the remaining variance is spread over many principal components and is not relevant in obtaining a low-dimensional ‘snapshot’ of the data. Based on this, we chose to keep two PCA dimensions.
# Numerical output
pca.wq.all.reps <- pca(wq.all.reps.for.pca[,c(6:14, 16:23)], # getting the numerical values only
ncomp = 10,
center = TRUE,
scale = TRUE)
# Explained variance per PCA component
knitr::kable(pca.wq.all.reps$prop_expl_var$X, caption = "The proportion of explained variance per each PCA component is:")
| x | |
|---|---|
| PC1 | 0.4066099 |
| PC2 | 0.1964915 |
| PC3 | 0.0910662 |
| PC4 | 0.0643839 |
| PC5 | 0.0470814 |
| PC6 | 0.0422856 |
| PC7 | 0.0305387 |
| PC8 | 0.0288760 |
| PC9 | 0.0231195 |
| PC10 | 0.0200885 |
# The cumulative proportion of variance explained by each PCA component
knitr::kable(pca.wq.all.reps$cum.var, caption = "The cumulative proportion of variance explained by each PCA component")
| x | |
|---|---|
| PC1 | 0.4066099 |
| PC2 | 0.6031014 |
| PC3 | 0.6941676 |
| PC4 | 0.7585515 |
| PC5 | 0.8056329 |
| PC6 | 0.8479185 |
| PC7 | 0.8784572 |
| PC8 | 0.9073332 |
| PC9 | 0.9304527 |
| PC10 | 0.9505412 |
PCA_WQ_sample_plot <- plotIndiv(pca.wq.all.reps,
comp = c(1, 2),
group = wq.all.reps.for.pca$Sampling_trip,
# ind.names = wq.all.reps.for.pca$REEF_NAME,
ellipse = T,
col.per.group =c("indianred", # Sampling trip 1
"indianred4", # Sampling trip 2
"red3", # Sampling trip 3
"slateblue"), # Sampling trip 4
legend = TRUE,
title = 'WQ Metadata all reps, PCA comp 1 - 2')
PCA_WQ_biplot <- biplot(pca.wq.all.reps,
comp = c(1, 2),
group = wq.all.reps.for.pca$Sampling_trip,
# ind.names = wq.all.reps.for.pca$REEF_NAME,
col.per.group =c("indianred", # Sampling trip 1
"indianred4", # Sampling trip 2
"red3", # Sampling trip 3
"slateblue"), # Sampling trip 4
legend = TRUE,
legend.title = "Sampling trip",
title = 'PCA biplot for WQ Metadata all reps, PCA comp 1 - 2')
The PCA results suggest that our water chemistry measurements from across the GBR were largely driven by seasonality, while geography had a weaker influence. Chemistry profiles of samples collected in early austral summer were comparable despite being >1500 km apart in the far north (Cape Grenville and Princess Charlotte bay sectors) and far south (Swains and Capricorn Bunker sectors) of the GBR, whereas samples collected during the peaks of austral summer and winter were the most distinct although they were geographically close in the central GBR (~200 km apart, Cairns and Cooktown / Lizard island sectors for austral summer samples, and Innisfail and Townsville sectors for austral winter samples). Further, we observe that summer trips 1-3 were characterised by elevated temperature and higher concentrations of dissolved and particulate nutrients, apart frpm TDP and phosphate which were elevated during winter.
However, we did not show reef names in either of the PCA plots as there is an overlap between data points (and hence the text is not readable), and also in these PCA visualisations, we lose context of raw values. This information was added with a heatmap (to compare physico-chemical metrics across sites) and with boxplots (which show the raw physico-chemical measurements).
We first collapsed the data to a mean/median value because for each of the 17 environmental metrics we computed the median value per reef site as the number of Niskin deployments differed for molecular (four replicates) and water chemistry (three replicates) sampling.
# Making the heatmap of LTMP data
WQ_heatmap <- metadata[,24:40] %>% # I am only choosing columns with median values
scale(center = TRUE, scale = TRUE) %>% # I wand the values to be scaled
as.data.frame() %>% # Converting back to data frame - ggplot needs this
rownames_to_column("Sample_ID") %>% # Setting rownames as Col 1 - will need this for melting
reshape2::melt() %>% # Getting the long format - this is what geom_tile needs
left_join(metadata[, c(1,2)] # adding back the REEF NAME and SAMPLING TRIP vars
%>% rownames_to_column("Sample_ID")
) %>% # Need to convert row names to Column 1 and give Sample_ID as name, because I am joining those with the same ID
ggplot(aes(x = REEF_NAME, y = variable, fill = value)) +
geom_tile() + # Plotting the heatmap here
scale_fill_gradient2(low = "#075AFF",
mid = "#FFFFCC",
high = "#FF0000") + # The coloring scheme - red for high vals, blue for low
facet_wrap(~Sampling_trip, scales = "free_x", ncol = 4) + # now facetting reef sites based on the Sampling trip
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8))
WQ_heatmap
The heatmap shows the level of change in all 17 physico-chemical variables (y axis) across the reef sites (x axis), grouped within their corresponding sampling trip. Environmental measurements were centered (median = 0) and scaled (standard deviation (SD) = 1) across reef sites, and values that deviate from the median (0) were shown in red (> median) and blue (< median). This heatmap was combined in Inkscape with the PCA visualisation for physico-chemical data to re-introduce the context of reef sites, which were not visualised in the PCA.
# Median is the default in ggplot2
reshape2::melt(wq.all.reps.for.pca[,c(1, # Reef name
6:19, # All numerical vals
21, # Temperature
22, # Salinity
23, # Fluorescence
20)]) %>% # Sampling trip
ggplot(aes(y = value,
x = Sampling_trip,
fill = Sampling_trip),
alpha=0.8) +
geom_boxplot(#outlier.colour="red",
outlier.shape=8,
outlier.size=4) +
geom_jitter(alpha = 0.6,
size = 0.8) +
stat_summary(fun=mean,
geom="point",
shape=20,
size=0.8,
color="seagreen1",
fill="seagreen1") + # Plotting the mean as a green dot!
facet_grid(rows = vars(variable),
cols = vars(Sampling_trip),
scales = "free"
) +
scale_fill_manual(values = c("indianred", # Sampling trip 1
"indianred4", # Sampling trip 2
"red3", # Sampling trip 3
"slateblue") # Sampling trip 4
) +
labs(y = "WQ metrics",
x = "Reef sites",
title = "Boxplots for WQ metrics (Median & Mean)"
) +
theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 8))
FIG CAP TO BE ADDED.
Getting the numerical summary of physico-chemical variables
# Data needs to be in long format
wq_median_per_trip <- reshape2::melt(wq.all.reps.for.pca[,c(1, # Reef name
6:19, # All numerical vals
21, # Temperature
22, # Salinity
23, # Fluorescence
20)]) %>% # Sampling_trip
as.data.frame() %>%
group_by(Sampling_trip, variable) %>%
# Now computing mean and SD
dplyr::summarize( # This tutorial for troubleshooting! https://stackoverflow.com/questions/46661461/calculate-mean-by-group-using-dplyr-package
median=round(median(value, na.rm=TRUE),
digits = 2)
) %>%
reshape2::dcast(variable~Sampling_trip)
# Showing as table
knitr::kable(wq_median_per_trip, caption = "Median for all 17 physico-chemical metrics, collapsed across the four sampling trips.")
| variable | Trip_01_Nov-Dec_2019 | Trip_02_January_2020 | Trip_03_February_2020 | Trip_04_July_2020 |
|---|---|---|---|---|
| Chlorophyll_a_.µg.L. | 0.17 | 0.15 | 0.23 | 0.10 |
| Phaeophytin_a_.µg.L. | 0.17 | 0.17 | 0.36 | 0.10 |
| PN_.µM. | 1.23 | 1.19 | 1.30 | 0.50 |
| POC_.µM. | 7.12 | 7.77 | 9.74 | 3.52 |
| PP_.µM. | 0.04 | 0.04 | 0.06 | 0.02 |
| DOC_.µM. | 83.75 | 79.58 | 65.83 | 69.53 |
| PO4_.µM. | 0.05 | 0.04 | 0.02 | 0.09 |
| NH4_.µM. | 0.32 | 0.52 | 0.68 | 0.11 |
| NO2_.µM. | 0.02 | 0.04 | 0.03 | 0.01 |
| NO3_.µM. | 0.21 | 0.34 | 0.21 | 0.20 |
| Si_.µM. | 1.38 | 1.14 | 2.00 | 1.86 |
| TDN_.µM. | 5.43 | 6.53 | 5.70 | 5.28 |
| TDP_.µM. | 0.20 | 0.23 | 0.16 | 0.26 |
| TSS_.mg.L. | 0.43 | 0.13 | 0.08 | 0.05 |
| SEAWATER_TEMPERATURE_2.5m_RV | 27.78 | 27.16 | 30.16 | 24.40 |
| SALINITY_2.5m_RV | 35.27 | 35.55 | 34.72 | 35.16 |
| FLUORESCENCE_2.5m_RV | 0.10 | 0.11 | 0.32 | 0.09 |
# Data needs to be in long format
wq_mean_per_trip <- reshape2::melt(wq.all.reps.for.pca[,c(1, # Reef name
6:19, # All numerical vals
21, # Temperature
22, # Salinity
23, # Fluorescence
20)]) %>% # Sampling_trip
as.data.frame() %>%
group_by(Sampling_trip, variable) %>%
# Now computing mean and SD
dplyr::summarize( # This tutorial for troubleshooting! https://stackoverflow.com/questions/46661461/calculate-mean-by-group-using-dplyr-package
mean=round(mean(value, na.rm=TRUE),
digits = 2)
) %>%
reshape2::dcast(variable~Sampling_trip)
# Showing as table
knitr::kable(wq_mean_per_trip, caption = "Mean for all 17 physico-chemical metrics, collapsed across the four sampling trips.")
| variable | Trip_01_Nov-Dec_2019 | Trip_02_January_2020 | Trip_03_February_2020 | Trip_04_July_2020 |
|---|---|---|---|---|
| Chlorophyll_a_.µg.L. | 0.18 | 0.16 | 0.32 | 0.11 |
| Phaeophytin_a_.µg.L. | 0.18 | 0.20 | 0.36 | 0.10 |
| PN_.µM. | 1.23 | 1.27 | 1.32 | 0.50 |
| POC_.µM. | 8.06 | 7.60 | 9.95 | 3.66 |
| PP_.µM. | 0.05 | 0.05 | 0.07 | 0.02 |
| DOC_.µM. | 84.51 | 81.92 | 67.22 | 69.30 |
| PO4_.µM. | 0.05 | 0.04 | 0.02 | 0.10 |
| NH4_.µM. | 0.39 | 0.58 | 0.74 | 0.12 |
| NO2_.µM. | 0.03 | 0.04 | 0.04 | 0.01 |
| NO3_.µM. | 0.30 | 0.33 | 0.35 | 0.23 |
| Si_.µM. | 1.41 | 1.30 | 2.10 | 1.79 |
| TDN_.µM. | 5.47 | 6.62 | 5.64 | 5.18 |
| TDP_.µM. | 0.20 | 0.23 | 0.16 | 0.26 |
| TSS_.mg.L. | 0.48 | 0.15 | 0.35 | 0.12 |
| SEAWATER_TEMPERATURE_2.5m_RV | 27.78 | 27.13 | 30.01 | 24.22 |
| SALINITY_2.5m_RV | 35.35 | 35.52 | 34.71 | 35.16 |
| FLUORESCENCE_2.5m_RV | 0.10 | 0.10 | 0.34 | 0.13 |
# Data needs to be in long format
wq_sd_per_trip <- reshape2::melt(wq.all.reps.for.pca[,c(1, # Reef name
6:19, # All numerical vals
21, # Temperature
22, # Salinity
23, # Fluorescence
20)]) %>% # Sampling_trip
as.data.frame() %>%
group_by(Sampling_trip, variable) %>%
# Now computing mean and SD
dplyr::summarize( # This tutorial for troubleshooting! https://stackoverflow.com/questions/46661461/calculate-mean-by-group-using-dplyr-package
sd=round(sd(value, na.rm=TRUE),
digits = 2)
) %>%
reshape2::dcast(variable~Sampling_trip)
# Showing as table
knitr::kable(wq_sd_per_trip, caption = "SD for all 17 physico-chemical metrics, collapsed across the four sampling trips.")
| variable | Trip_01_Nov-Dec_2019 | Trip_02_January_2020 | Trip_03_February_2020 | Trip_04_July_2020 |
|---|---|---|---|---|
| Chlorophyll_a_.µg.L. | 0.06 | 0.08 | 0.18 | 0.03 |
| Phaeophytin_a_.µg.L. | 0.04 | 0.08 | 0.15 | 0.02 |
| PN_.µM. | 0.35 | 0.46 | 0.22 | 0.10 |
| POC_.µM. | 2.86 | 1.89 | 2.29 | 1.00 |
| PP_.µM. | 0.02 | 0.02 | 0.03 | 0.01 |
| DOC_.µM. | 5.99 | 9.89 | 4.60 | 4.67 |
| PO4_.µM. | 0.03 | 0.02 | 0.02 | 0.02 |
| NH4_.µM. | 0.15 | 0.27 | 0.44 | 0.06 |
| NO2_.µM. | 0.02 | 0.01 | 0.02 | 0.00 |
| NO3_.µM. | 0.25 | 0.15 | 0.31 | 0.16 |
| Si_.µM. | 0.30 | 0.44 | 0.55 | 0.65 |
| TDN_.µM. | 0.83 | 0.82 | 0.72 | 0.75 |
| TDP_.µM. | 0.03 | 0.04 | 0.03 | 0.02 |
| TSS_.mg.L. | 0.41 | 0.15 | 0.52 | 0.10 |
| SEAWATER_TEMPERATURE_2.5m_RV | 0.43 | 0.61 | 0.39 | 0.95 |
| SALINITY_2.5m_RV | 0.21 | 0.17 | 0.05 | 0.04 |
| FLUORESCENCE_2.5m_RV | 0.01 | 0.02 | 0.05 | 0.12 |
# Exporting Medians as csv
write.csv(wq_median_per_trip, file = "/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Paper_drafts/the_analysis_is_finalised/Supplementary_Tables/Table_WQ_Median_per_trip.csv", quote = F, row.names = F)
# Exporting Means as csv
write.csv(wq_mean_per_trip, file = "/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Paper_drafts/the_analysis_is_finalised/Supplementary_Tables/Table_WQ_mean_per_trip.csv", quote = F, row.names = F)
# Exporting SD as csv
write.csv(wq_sd_per_trip, file = "/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Paper_drafts/the_analysis_is_finalised/Supplementary_Tables/Table_WQ_SD_per_trip.csv", quote = F, row.names = F)
# The csv files on median and sd values were merged manually to make the Table 1 in the main text of the manuscript
Raw counts were exported from MEGAN as biom files separately for (1) microbial taxonomy (genus level as the lowest category) and for (2) microbial functions (GO terms), and subsequently imported into R using the phyloseq R package. These biom files were combined with the metadata file to create 2 phyloseq objects (for taxa and genes), which have then undergone various filtering steps.
### Importing the biom tables, exported from MEGAN
### Taxonomy info | at 'Genus' level
megan_genus <- import_biom("/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Paper_drafts/the_analysis_is_finalised/Code/input_files/IMOS-MGD_Seawater_full_dataset_Genera_191_samples_Neg_controls_July_2024.biom")
### Functional info | GO terms
megan_GO_5 <- import_biom("/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Paper_drafts/the_analysis_is_finalised/Code/input_files/IMOS-MGD_Seawater_GOs_Rank5_191_samples_Neg_controls_July_2024.biom")
# This one has 7476 GO terms
megan_GO_4 <- import_biom("/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Paper_drafts/the_analysis_is_finalised/Code/input_files/IMOS-MGD_Seawater_GOs_Rank4_191_samples_Neg_controls_July_2024.biom")
# This one has 5257 GO terms
megan_GO_3 <- import_biom("/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Paper_drafts/the_analysis_is_finalised/Code/input_files/IMOS-MGD_Seawater_GOs_Rank3_191_samples_Neg_controls_July_2024.biom")
# This one has 705 GO terms
# Let's just modify the metadata file a bit to include neg controls as well
metadata_neg_controls <- left_join(megan_genus@otu_table %>%
t() %>%
as.data.frame() %>%
rownames_to_column("Sample_ID") %>%
dplyr::select("Sample_ID"),
metadata %>%
rownames_to_column("Sample_ID")) %>%
column_to_rownames("Sample_ID")
# Merging:
sample_data(megan_genus) <- sample_data(metadata_neg_controls)
sample_data(megan_GO_5) <- sample_data(metadata_neg_controls)
sample_data(megan_GO_4) <- sample_data(metadata_neg_controls)
sample_data(megan_GO_3) <- sample_data(metadata_neg_controls)
# Checking the phyloseq objects
megan_genus
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 2066 taxa and 207 samples ]
## sample_data() Sample Data: [ 207 samples by 40 sample variables ]
## tax_table() Taxonomy Table: [ 2066 taxa by 7 taxonomic ranks ]
megan_GO_5
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 7476 taxa and 207 samples ]
## sample_data() Sample Data: [ 207 samples by 40 sample variables ]
## tax_table() Taxonomy Table: [ 7476 taxa by 6 taxonomic ranks ]
megan_GO_4
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 5257 taxa and 207 samples ]
## sample_data() Sample Data: [ 207 samples by 40 sample variables ]
## tax_table() Taxonomy Table: [ 5257 taxa by 4 taxonomic ranks ]
megan_GO_3
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 706 taxa and 207 samples ]
## sample_data() Sample Data: [ 207 samples by 40 sample variables ]
## tax_table() Taxonomy Table: [ 706 taxa by 3 taxonomic ranks ]
# But I want to filter out the PF samples, and Broomfield rep 2 (because the sequencing was repeated for this one)
megan_genus <- subset_samples(megan_genus, sample_names(megan_genus)!='Lynchs-PF-1_S107_R1' &
sample_names(megan_genus)!='Lynchs-PF-2_S108_R1' &
sample_names(megan_genus)!='Lynchs-PF-3_S109_R1' &
sample_names(megan_genus)!='Lynchs-PF-4_S110_R1' &
sample_names(megan_genus)!='Myrmidon-PF-1_S111_R1' &
sample_names(megan_genus)!='Myrmidon-PF-2_S112_R1' &
sample_names(megan_genus)!='Myrmidon-PF-3_S113_R1' &
sample_names(megan_genus)!='Myrmidon-PF-4_S114_R1' &
sample_names(megan_genus)!='Rib-PF-1_S103_R1' &
sample_names(megan_genus)!='Rib-PF-2_S104_R1' &
sample_names(megan_genus)!='Rib-PF-3_S105_R1' &
sample_names(megan_genus)!='Rib-PF-4_S106_R1' &
sample_names(megan_genus)!='Broomfield-2_S50_R1')
megan_GO_5 <- subset_samples(megan_GO_5, sample_names(megan_GO_5)!='Lynchs-PF-1_S107_R1' &
sample_names(megan_GO_5)!='Lynchs-PF-2_S108_R1' &
sample_names(megan_GO_5)!='Lynchs-PF-3_S109_R1' &
sample_names(megan_GO_5)!='Lynchs-PF-4_S110_R1' &
sample_names(megan_GO_5)!='Myrmidon-PF-1_S111_R1' &
sample_names(megan_GO_5)!='Myrmidon-PF-2_S112_R1' &
sample_names(megan_GO_5)!='Myrmidon-PF-3_S113_R1' &
sample_names(megan_GO_5)!='Myrmidon-PF-4_S114_R1' &
sample_names(megan_GO_5)!='Rib-PF-1_S103_R1' &
sample_names(megan_GO_5)!='Rib-PF-2_S104_R1' &
sample_names(megan_GO_5)!='Rib-PF-3_S105_R1' &
sample_names(megan_GO_5)!='Rib-PF-4_S106_R1' &
sample_names(megan_GO_5)!='Broomfield-2_S50_R1')
megan_GO_4 <- subset_samples(megan_GO_4, sample_names(megan_GO_4)!='Lynchs-PF-1_S107_R1' &
sample_names(megan_GO_4)!='Lynchs-PF-2_S108_R1' &
sample_names(megan_GO_4)!='Lynchs-PF-3_S109_R1' &
sample_names(megan_GO_4)!='Lynchs-PF-4_S110_R1' &
sample_names(megan_GO_4)!='Myrmidon-PF-1_S111_R1' &
sample_names(megan_GO_4)!='Myrmidon-PF-2_S112_R1' &
sample_names(megan_GO_4)!='Myrmidon-PF-3_S113_R1' &
sample_names(megan_GO_4)!='Myrmidon-PF-4_S114_R1' &
sample_names(megan_GO_4)!='Rib-PF-1_S103_R1' &
sample_names(megan_GO_4)!='Rib-PF-2_S104_R1' &
sample_names(megan_GO_4)!='Rib-PF-3_S105_R1' &
sample_names(megan_GO_4)!='Rib-PF-4_S106_R1' &
sample_names(megan_GO_4)!='Broomfield-2_S50_R1')
megan_GO_3 <- subset_samples(megan_GO_3, sample_names(megan_GO_3)!='Lynchs-PF-1_S107_R1' &
sample_names(megan_GO_3)!='Lynchs-PF-2_S108_R1' &
sample_names(megan_GO_3)!='Lynchs-PF-3_S109_R1' &
sample_names(megan_GO_3)!='Lynchs-PF-4_S110_R1' &
sample_names(megan_GO_3)!='Myrmidon-PF-1_S111_R1' &
sample_names(megan_GO_3)!='Myrmidon-PF-2_S112_R1' &
sample_names(megan_GO_3)!='Myrmidon-PF-3_S113_R1' &
sample_names(megan_GO_3)!='Myrmidon-PF-4_S114_R1' &
sample_names(megan_GO_3)!='Rib-PF-1_S103_R1' &
sample_names(megan_GO_3)!='Rib-PF-2_S104_R1' &
sample_names(megan_GO_3)!='Rib-PF-3_S105_R1' &
sample_names(megan_GO_3)!='Rib-PF-4_S106_R1' &
sample_names(megan_GO_3)!='Broomfield-2_S50_R1')
# Checking the object again
megan_genus
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 2066 taxa and 194 samples ]
## sample_data() Sample Data: [ 194 samples by 40 sample variables ]
## tax_table() Taxonomy Table: [ 2066 taxa by 7 taxonomic ranks ]
megan_GO_5
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 7476 taxa and 194 samples ]
## sample_data() Sample Data: [ 194 samples by 40 sample variables ]
## tax_table() Taxonomy Table: [ 7476 taxa by 6 taxonomic ranks ]
megan_GO_4
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 5257 taxa and 194 samples ]
## sample_data() Sample Data: [ 194 samples by 40 sample variables ]
## tax_table() Taxonomy Table: [ 5257 taxa by 4 taxonomic ranks ]
megan_GO_3
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 706 taxa and 194 samples ]
## sample_data() Sample Data: [ 194 samples by 40 sample variables ]
## tax_table() Taxonomy Table: [ 706 taxa by 3 taxonomic ranks ]
# These samples still include the 3 negative controls
After removing the non pre-filtered samples, further data filtering included removal of reads (1) annotated as eukaryotic or viral; and (2) rare/spurious reads. Data was then Center-Log-Ratio (hereinafter ‘CLR’) transformed for statistical analysis in the mixOmics R package.
We annotated a total of 1919 microbial taxa (lowest category: genus level). Reads that were annotated as Eukarya (729 taxa in total) and viruses (11 viral annotations) were excluded from the analysis. Further analysis was performed on a phyloseq object with prokaryotic annotations only, a total of 1179 bacterial and archaeal groups (Figure 2, Table 1).
# Before plotting the bar plots, I first need to prepare my objects
### Taxonomy info | at 'Genus' level
megan_genus_all <- megan_genus
megan_genus_TAX_all <- as.data.frame(megan_genus_all@tax_table)
# Plot admixture barplot - Domain level (and viruses)
cols_domain <- c(
"d__Archaea" = "slategray3", # Archaea
"d__Bacteria" = "grey45", # Bacteria
"d__Eukaryota" = "salmon", # Eukaryota
"f__Mimiviridae" = "violetred", # Family Mimiviridae
"f__Phycodnaviridae" = "steelblue3", # Family Phycodnaviridae
"f__Retroviridae" = "lightsteelblue4", # Family Retroviridae
"o__Caudovirales" = "seashell4" # Order Caudovirales
)
DOMAIN <- as.data.frame(megan_genus_all@otu_table) %>%
rownames_to_column("OTUs") %>% # I will need this later to add taxonomy info
left_join(megan_genus_TAX_all %>% rownames_to_column("OTUs")) %>% # adding taxonomy info
column_to_rownames("OTUs") %>%
group_by(Rank1) %>%
# Keeping only numerical values now
summarise_if(.predicate = function(x) is.numeric(x),
.funs = funs(sum)) # Computing sums
# Now relative abundances
DOMAIN_RA <- DOMAIN
for (i in 2:(ncol(DOMAIN_RA))) {
DOMAIN_RA[i] <- DOMAIN_RA[i] / sum(DOMAIN_RA[i])
}
barplots_domain <- DOMAIN_RA %>%
column_to_rownames("Rank1") %>%
t() %>%
as.data.frame() %>%
rownames_to_column("Sample_ID") %>%
reshape2::melt() %>%
left_join(metadata %>% rownames_to_column("Sample_ID")) %>%
# Plotting now!
ggplot(aes(x=Sample_ID, y=value, fill=variable))+
geom_bar(stat = "identity")+
scale_y_continuous(expand = c(0,0))+
facet_wrap(~Sampling_trip, scales = "free", nrow = 5)+
# facet_grid(~Sampling_trip, scales = "free_x", space = "free")+
scale_fill_manual(values = cols_domain)+
ylab("Relative abundance of taxa (at Domain level)")+
xlab("Reef sites")+
theme(axis.text.x = # element_blank(),
element_text(angle = 75, hjust = 1, size = 12),
#axis.ticks.x = element_blank(),
#axis.title.x = element_blank(),
strip.text = element_text(colour="black", size=12),
panel.grid = element_blank(),
panel.background = element_blank(),
legend.position = "right",
legend.title = element_blank(),
legend.text = element_text(size = 12))
barplots_domain
We see that we have eukaryotic reads, let’s see how many taxa?
megan_genus_bacteria <- subset_taxa(megan_genus, # Phyloseq object with all OTUs
Rank1=="d__Bacteria") # The phyloseq object with raw counts
megan_genus_archaea <- subset_taxa(megan_genus, # Phyloseq object with all OTUs
Rank1=="d__Archaea") # The phyloseq object with raw counts
megan_genus_PROKS <- merge_phyloseq(megan_genus_bacteria,
megan_genus_archaea) # Phyloseq object with Proks only
megan_genus_EUKS <- subset_taxa(megan_genus_all,
Rank1=="d__Eukaryota") # Phyloseq object with Euks only
knitr::kable(as.data.frame(cbind(as.character(ntaxa(megan_genus_EUKS)), as.character(ntaxa(megan_genus_bacteria)), as.character(ntaxa(megan_genus_archaea)), as.character(ntaxa(megan_genus_PROKS)))), caption = "Taxonomic breakdown", col.names = c("Eukaryota", "Bacteria", "Archaea", "Prokarya"))
| Eukaryota | Bacteria | Archaea | Prokarya |
|---|---|---|---|
| 774 | 1212 | 45 | 1257 |
If we compare with abundances of prokaryotes (which are the target for this study), are there any euks that are highly abundant?
megan_genus_all_with_euks <- import_biom("/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Paper_drafts/the_analysis_is_finalised/Code/input_files/IMOS-MGD_Seawater_full_dataset_Genera_191_samples_Neg_controls_July_2024.biom")
# Merging!
sample_data(megan_genus_all_with_euks) <- sample_data(metadata_neg_controls)
# Removing the PF samples and Broomfield 2 (seq failed for this rep, and we have the repeated sample for rep 2)
megan_genus_all_with_euks <- subset_samples(megan_genus_all_with_euks, sample_names(megan_genus_all_with_euks)!='Lynchs-PF-1_S107_R1' &
sample_names(megan_genus_all_with_euks)!='Lynchs-PF-2_S108_R1' &
sample_names(megan_genus_all_with_euks)!='Lynchs-PF-3_S109_R1' &
sample_names(megan_genus_all_with_euks)!='Lynchs-PF-4_S110_R1' &
sample_names(megan_genus_all_with_euks)!='Myrmidon-PF-1_S111_R1' &
sample_names(megan_genus_all_with_euks)!='Myrmidon-PF-2_S112_R1' &
sample_names(megan_genus_all_with_euks)!='Myrmidon-PF-3_S113_R1' &
sample_names(megan_genus_all_with_euks)!='Myrmidon-PF-4_S114_R1' &
sample_names(megan_genus_all_with_euks)!='Rib-PF-1_S103_R1' &
sample_names(megan_genus_all_with_euks)!='Rib-PF-2_S104_R1' &
sample_names(megan_genus_all_with_euks)!='Rib-PF-3_S105_R1' &
sample_names(megan_genus_all_with_euks)!='Rib-PF-4_S106_R1' &
sample_names(megan_genus_all_with_euks)!='Broomfield-2_S50_R1')
# Removing the non-annotated stuff!
megan_genus_all_anno_only <- subset_taxa(megan_genus_all_with_euks, Rank2!="NA")
# Getting relative abundances too
megan_genus_all_RA = transform_sample_counts(megan_genus_all_with_euks, function(x) x / sum(x) )
# Selecting the top 100 most abundant MAGs (based on RA data)
megan_genus_top200_RA_abund_with_euks <- taxa_sums(megan_genus_all_RA) %>%
sort(decreasing = TRUE) %>%
head(200) %>% # Taking the first X most abundant taxa.
# Change the number depending on how many Genera I want to look at
names()
# Making a new phyloseq object
megan_genus_top200_RA_with_euks <- prune_taxa(megan_genus_top200_RA_abund_with_euks, # These are the top 20
megan_genus_all_RA)
# Defining breaks - to make sure even very lowly abundant taxa will be visible!
# From Steve:
breaks=c(0,0.001,0.01,0.05,0.1,0.25,0.4,0.5,0.6,0.7,1)
# But I want to have less breaks
# breaks_5=c(0,0.001,0.1,0.3,0.7,1)
# Plot heatmap
left_join(otu_table(megan_genus_top200_RA_with_euks) %>% as.data.frame %>% rownames_to_column("OTU"),
tax_table(megan_genus_top200_RA_with_euks) %>% as.data.frame %>% rownames_to_column("OTU")) %>%
arrange(match(OTU, megan_genus_top200_RA_abund_with_euks)) %>% # Arranging by abundances here
unite(taxonomy, c(OTU, Rank1, Rank2, Rank3, Rank4, Rank5, Rank6, Rank7), sep = "; ") %>% # Adding Taxonomy info
gather(Sample_ID, Reads, -taxonomy) %>% # 'Reads' contains the Raw counts
# left_join(as.data.frame(sample_data(megan_genus_all_RA)) %>% rownames_to_column("Sample_ID")) %>% # Now joining with the metadata
left_join(metadata %>% rownames_to_column("Sample_ID")) %>%
# Ready to plot now!
ggplot(aes(x = Sample_ID, # Short reef names on the x axis
y = reorder(taxonomy, # Taxonomy info on the y axis
Reads), # With Taxa ordered based on abundances, most abundant listed first
fill = Reads)) + # Change to 'Reads' if plotting the raw counts
geom_tile() + # This colors the heatmap in blue & makes the more abundant taxa darker in color
facet_grid(cols = vars(Sampling_trip), scales = "free_x", space = "free") + # Splitting in facets
theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 8)) + # Rotating the text at 90 degrees angle
# Use this if I want a smaller number of breaks
# scale_fill_stepsn(breaks = breaks_5, colours =c("white", # for the 0-0.001 RA range!
# "slategray1", "slategray2", "slategray3", "slategray4")) +
# Or from Steve:
scale_fill_stepsn(breaks = breaks, colours =c("white", # for the 0-0.001 RA range!
"slategray4", # 001 - 0.01
"slategray3", # 0.01 - 0.05
"slategray2", # 0.05 - 0.1
"navajowhite", # 0.1 - 0.25
"rosybrown2", # 0.25,0.4
"lightsalmon", # 0.4 - 0.5
"rosybrown1", # 0.5 - 0.6
# "lightgoldenrod1", # 0.6 - 0.7
"indianred2")) + # 0.7,1
scale_x_discrete(expand = c(0, 0)) +
scale_y_discrete(expand = c(0, 0))
Prior to removal of rare and spurious reads, non-annotated reads were removed from the dataset. We then computed relative abundance values (RA) and removed reads with average RA < 0.0001% across samples. After removing OTUs that were less than 0.0001% abundant, we retained 618 taxa (primarily at Genus level) out of the initial 1179 prokaryotic OTUs. At functional level, we retained 5015 GO annotations (out of 8689 GO terms).
### IMPORTANT - ***Change this part of the script*** depending on which phyloseq object I would like to look at: prokaryotic, eukaryotic or all. This way I wouldn't need to modify multiple lines in the script below
megan_genus <- megan_genus_PROKS # options to choose from: megan_genus_PROKS, megan_genus_EUKS
# Cleaning the names here already! This way I will make sure every other phyloseq object will have organised taxonomy
megan_genus_TAX_PROKS <- as.data.frame(megan_genus@tax_table)
# Unite the names within one column called "Taxonomy"
megan_genus_TAX_PROKS <- megan_genus_TAX_PROKS %>%
unite(Taxonomy, c(Rank1, Rank2, Rank3, Rank4, Rank5, Rank6, Rank7), sep = "; ") # Adding Taxonomy info
# Initialize empty columns
megan_genus_TAX_PROKS$Domain <- NA
megan_genus_TAX_PROKS$Phylum <- NA
megan_genus_TAX_PROKS$Class <- NA
megan_genus_TAX_PROKS$Order <- NA
megan_genus_TAX_PROKS$Family <- NA
megan_genus_TAX_PROKS$Genus <- NA
megan_genus_TAX_PROKS$Species <- NA
# Categorise taxonomic strings based on patterns:
megan_genus_TAX_PROKS$Domain <- str_match(megan_genus_TAX_PROKS$Taxonomy, "^d__(.+?);")[, 2]
megan_genus_TAX_PROKS$Phylum <- str_match(megan_genus_TAX_PROKS$Taxonomy, "; p__(.+?);")[, 2]
megan_genus_TAX_PROKS$Class <- str_match(megan_genus_TAX_PROKS$Taxonomy, "; c__(.+?);")[, 2]
megan_genus_TAX_PROKS$Order <- str_match(megan_genus_TAX_PROKS$Taxonomy, "; o__(.+?);")[, 2]
megan_genus_TAX_PROKS$Family <- str_match(megan_genus_TAX_PROKS$Taxonomy, "; f__(.+?);")[, 2]
megan_genus_TAX_PROKS$Genus <- str_match(megan_genus_TAX_PROKS$Taxonomy, "; g__(.+?);")[, 2]
megan_genus_TAX_PROKS$Species <- str_match(megan_genus_TAX_PROKS$Taxonomy, "; s__(.+?);")[, 2]
# Last thing: replacing missing values with "Unknown_*"
megan_genus_TAX_PROKS$Domain[is.na(megan_genus_TAX_PROKS$Domain)] <- "NA"
megan_genus_TAX_PROKS$Phylum[is.na(megan_genus_TAX_PROKS$Phylum)] <- "NA"
megan_genus_TAX_PROKS$Class[is.na(megan_genus_TAX_PROKS$Class)] <- "NA"
megan_genus_TAX_PROKS$Order[is.na(megan_genus_TAX_PROKS$Order)] <- "NA"
megan_genus_TAX_PROKS$Family[is.na(megan_genus_TAX_PROKS$Family)] <- "NA"
megan_genus_TAX_PROKS$Genus[is.na(megan_genus_TAX_PROKS$Genus)] <- "NA"
megan_genus_TAX_PROKS$Species[is.na(megan_genus_TAX_PROKS$Species)] <- "NA"
# Remove the original taxonomy column
megan_genus_TAX_PROKS <- megan_genus_TAX_PROKS %>%
dplyr::select(Domain, Phylum, Class, Order, Family, Genus, Species)
# All cleaned up! :) thanks ChatGPT
### Putting this back into the phyloseq object:
# First checking the current taxonomic names in phyloseq object
current_taxa_names <- taxa_names(megan_genus)
# Compare with tax_table column names and order
polished_tax_table <- colnames(t(megan_genus_TAX_PROKS))
# Check if they match
if (!identical(current_taxa_names, polished_tax_table)) {
stop("Polished taxonomic names in megan_genus_TAX_PROKS do not match the taxa_names in the megan_genus phyloseq object.")
}
# Looks like they match! So I'm not sure why I cannot merge them (code below)
# Check dimensions of tax_table and physeq - does the number of rows match?
nrow_tax_table <- nrow(megan_genus_TAX_PROKS)
ntaxa_physeq <-ntaxa(megan_genus) # Number of taxa in physeq
if (nrow_tax_table != ntaxa_physeq) {
stop("Number of rows in megan_genus_TAX_PROKS does not match the number of taxa in megan_genus phyloseq object.")
}
# Step 3: Compare order of unique values
identical(megan_genus_TAX_PROKS %>% # Looking for OTU order for the polishes taxa
rownames_to_column("OTUs") %>%
dplyr::select("OTUs"),
megan_genus@tax_table %>% # Looking for OTU order in the current phyloseqq object
as.data.frame() %>%
rownames_to_column("OTUs") %>%
dplyr::select("OTUs"))
## [1] TRUE
# Again, this is also the same
# Here too:
identical(row.names(megan_genus_TAX_PROKS),
row.names(otu_table(megan_genus))
)
## [1] TRUE
# Yes, the row names are identical
# Now adding this polished taxonomy to my phyloseq object:
tax_table(megan_genus) <- as.matrix(megan_genus_TAX_PROKS)
### Removing the negative controls too - taxa phyloseq object:
megan_genus_no_neg_control <- subset_samples(megan_genus,
sample_names(megan_genus)!='Neg-control-1_S101_R1' &
sample_names(megan_genus)!='Neg-control-2_S24_R1' &
sample_names(megan_genus)!='Neg-control-3_S116_R1')
### Removing the negative controls too - GOs at rank 5 phyloseq object:
megan_GO_5_no_neg_control <- subset_samples(megan_GO_5,
sample_names(megan_GO_5)!='Neg-control-1_S101_R1' &
sample_names(megan_GO_5)!='Neg-control-2_S24_R1' &
sample_names(megan_GO_5)!='Neg-control-3_S116_R1')
### Removing the negative controls too - GOs at rank 4 phyloseq object:
megan_GO_4_no_neg_control <- subset_samples(megan_GO_4,
sample_names(megan_GO_4)!='Neg-control-1_S101_R1' &
sample_names(megan_GO_4)!='Neg-control-2_S24_R1' &
sample_names(megan_GO_4)!='Neg-control-3_S116_R1')
### Removing the negative controls too - GOs at rank 3 phyloseq object:
megan_GO_3_no_neg_control <- subset_samples(megan_GO_3,
sample_names(megan_GO_3)!='Neg-control-1_S101_R1' &
sample_names(megan_GO_3)!='Neg-control-2_S24_R1' &
sample_names(megan_GO_3)!='Neg-control-3_S116_R1')
### Instead of setting an arbitrary threshold (e.g 100 seqs), I would like to filter based on relative abundances (***removing all OTUs < 0.0001% rel. abundance***)
# Tutorial I used: https://joey711.github.io/phyloseq/preprocess.html
### Removing reads that are annotated at Bacteria or Archaea levels only - not informative!
megan_genus_anno_only <- subset_taxa(megan_genus_no_neg_control, Phylum!="NA")
### Removing reads that were not annotated at Rank2 level - not informative!
megan_GO_5_anno_only <- subset_taxa(megan_GO_5_no_neg_control, Rank2!="NA")
megan_GO_4_anno_only <- subset_taxa(megan_GO_4_no_neg_control, Rank2!="NA")
megan_GO_3_anno_only <- subset_taxa(megan_GO_3_no_neg_control, Rank2!="NA")
# Getting the taxa data frame
megan_genus_TAX <- as.data.frame(megan_genus_anno_only@tax_table)
# Getting the taxa data frame
megan_GO_5_FUN <- as.data.frame(megan_GO_5_anno_only@tax_table)
megan_GO_4_FUN <- as.data.frame(megan_GO_4_anno_only@tax_table)
megan_GO_3_FUN <- as.data.frame(megan_GO_3_anno_only@tax_table)
### Getting the relative abundances
# Taxa
megan_genus_RA = transform_sample_counts(megan_genus_anno_only, function(x) x / sum(x) )
# GO terms
megan_GO_5_RA = transform_sample_counts(megan_GO_5_anno_only, function(x) x / sum(x) )
megan_GO_4_RA = transform_sample_counts(megan_GO_4_anno_only, function(x) x / sum(x) )
megan_GO_3_RA = transform_sample_counts(megan_GO_3_anno_only, function(x) x / sum(x) )
# removing all OTUs that are less than 0.0001% abundant
megan_genus_RA_no_rare = filter_taxa(megan_genus_RA, function(x) mean(x) > 1e-6, TRUE)
# removing all genes that are less than 0.0001% abundant
megan_GO_5_RA_no_rare = filter_taxa(megan_GO_5_RA, function(x) mean(x) > 1e-6, TRUE)
megan_GO_3_RA_no_rare = filter_taxa(megan_GO_3_RA, function(x) mean(x) > 1e-6, TRUE)
megan_GO_4_RA_no_rare = filter_taxa(megan_GO_4_RA, function(x) mean(x) > 1e-6, TRUE)
Before_after_filtering <- cbind(rbind(ntaxa(megan_genus), ntaxa(megan_genus_RA_no_rare)),
rbind(ntaxa(megan_GO_5), ntaxa(megan_GO_5_RA_no_rare))
# rbind(ntaxa(megan_COGs), ntaxa(megan_COGs_RA_no_rare))
) %>%
as.data.frame()
# Adding row names now
row.names(Before_after_filtering) <- c("Before filtering", "After filtering")
knitr::kable(Before_after_filtering, caption = "Removal of Rare/Spurious reads (< 0.0001% RA)", col.names = c("Taxa", "GO terms"), row.names = T)
| Taxa | GO terms | |
|---|---|---|
| Before filtering | 1257 | 7476 |
| After filtering | 621 | 4287 |
megan_genus_abundant <- prune_taxa(taxa_names(megan_genus_RA_no_rare), # List of OTUs after filtering
megan_genus_no_neg_control) # My phyloseq object with raw counts
megan_GO_5_abundant <- prune_taxa(taxa_names(megan_GO_5_RA_no_rare), # List of OTUs after filtering
megan_GO_5_no_neg_control) # My phyloseq object with raw counts
megan_GO_4_abundant <- prune_taxa(taxa_names(megan_GO_4_RA_no_rare), # List of OTUs after filtering
megan_GO_4_no_neg_control) # My phyloseq object with raw counts
megan_GO_3_abundant <- prune_taxa(taxa_names(megan_GO_3_RA_no_rare), # List of OTUs after filtering
megan_GO_3_no_neg_control) # My phyloseq object with raw counts
# CLR is the normalisation method suggested by the mixOmics R package for microbial data - a way to address missing values that are characteristic of microbial datasets. I need to remove missing values before doing the CLR normalisation - The geometric mean cannot be determined for sparse data without deleting, replacing or estimating the 0 count values. So I am introducing pseudo counts
### Tutorial used: http://mixomics.org/mixmc/mixmc-preprocessing/
# Checking if there are any zeros - BEFORE adding pseudocounts
sum(which(megan_genus_abundant@otu_table == 0))
## [1] 4791141201
sum(which(megan_GO_5_abundant@otu_table == 0))
## [1] 43240551782
# sum(which(megan_COGs_abundant@otu_table == 0))
# Pseudocounts - replacing all zero vals with 1;
megan_genus_abundant@otu_table <- megan_genus_abundant@otu_table + 1
megan_GO_5_abundant@otu_table <- megan_GO_5_abundant@otu_table +1
megan_GO_3_abundant@otu_table <- megan_GO_3_abundant@otu_table +1
megan_GO_4_abundant@otu_table <- megan_GO_4_abundant@otu_table +1
# megan_COGs_abundant@otu_table <- megan_COGs_abundant@otu_table + 1
# Checking if there are any zeros - AFTER adding pseudocounts
sum(which(megan_genus_abundant@otu_table == 0))
## [1] 0
sum(which(megan_GO_5_abundant@otu_table == 0))
## [1] 0
# sum(which(megan_COGs_abundant@otu_table == 0))
# All good! No NAs after introducing pseudocounts
### Now I can CLR transform when running analyses in mixOmics!
# I am using an option from the microbiome R package, not the same as in MixOmics.
megan_genus_clr <- microbiome::transform(megan_genus_abundant, "clr")
megan_go_clr_5 <- microbiome::transform(megan_GO_5_abundant, "clr")
megan_go_clr_3 <- microbiome::transform(megan_GO_3_abundant, "clr")
megan_go_clr_4 <- microbiome::transform(megan_GO_4_abundant, "clr")
# megan_COGs_clr <- microbiome::transform(megan_COGs_abundant, "clr")
# megan_go_clr_3_bp <- megan_go_clr_3 %>%
# subset_taxa(Rank2 == 'GO:0008150 biological_process')
# megan_go_clr_4_bp <- megan_go_clr_4 %>%
# subset_taxa(Rank2 == 'GO:0008150 biological_process')
# But for GO at lvl 3, I only want bio process
# save.image("/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Paper_drafts/the_analysis_is_finalised/Code/Code_for_Terzin_et_al_Microbial_Function_Outperforms_Taxonomy_in_Inferring_Water_Chemistry_across_the_Great_Barrier_Reef.RData")
# Preparing the object to have taxa names on boxplots
OTUs_biplot <- as.data.frame(megan_genus_clr@otu_table) %>%
t() # mixOmics needs samples and microbes to be reordered, so transposing here
# Check dimensions of data
dim(OTUs_biplot)
## [1] 191 561
class(OTUs_biplot)
## [1] "matrix" "array"
# Getting taxa names
OTUs_biplot_colnames_for_biplot <- left_join(otu_table(megan_genus_clr) %>%
as.data.frame %>%
rownames_to_column("OTU"),
tax_table(megan_genus_clr) %>%
as.data.frame %>%
rownames_to_column("OTU")) %>%
unite(taxonomy, c(Family, Genus), sep = "; ") # Adding Taxonomy info
## Joining, by = "OTU"
OTUs_biplot_colnames_for_biplot <- OTUs_biplot_colnames_for_biplot %>%
dplyr::select("OTU", "taxonomy")
# Merging with the OTUs_biplot object
OTUs_biplot_names <- left_join(t(OTUs_biplot) %>%
as.data.frame() %>%
rownames_to_column("OTU"),
OTUs_biplot_colnames_for_biplot) %>%
unite(Annotations, c(OTU, taxonomy), sep = "_") %>%
column_to_rownames("Annotations") %>% # moving this as rowposing back into the right format
t() # trans
# PCA
result.pca.taxa.names <- pca(OTUs_biplot_names)
# Plotting the PCA sample plot
plotIndiv(result.pca.taxa.names,
group = sample_data(megan_genus_abundant)$Sampling_trip,
title = 'PCA | Microbial Taxonomy',
legend = T,
ellipse = TRUE,
ind.names = F,
col.per.group =c("indianred", # Sampling trip 1
"indianred4", # Sampling trip 2
"red3", # Sampling trip 3
"slateblue"), # Sampling trip 4
legend.title = 'Sampling trip'
)
# Plotting the PCA biplot
biplot(result.pca.taxa.names,
comp = c(1, 2),
group = sample_data(megan_genus_abundant)$Sampling_trip,
ind.names = F,
ellipse = T,
col.per.group =c("indianred", # Sampling trip 1
"indianred4", # Sampling trip 2
"red3", # Sampling trip 3
"slateblue"), # Sampling trip 4
legend = TRUE,
vline = T,
hline = T,
cutoff = 0.65,
legend.title = "Sampling trip")
# Parameter tuning
tune.pca.taxa <- tune.pca(OTUs_biplot_names, ncomp = 10, scale = TRUE)
plot(tune.pca.taxa)
Screeplot from the PCA performed on the IMOS GBR-MGD metagenomics data (microbial taxonomy): Amount of explained variance for each principal component is shown. From the numerical output (shown bellow in tabular format), we observe that the first two principal components explain 60.31% of the total variance. The rule of thumb for choosing the number of PCA components is not so much to set a hard threshold based on the cumulative proportion of explained variance (as this is data-dependent), but to observe when a drop, or elbow, appears on the screeplot. The elbow indicates that the remaining variance is spread over many principal components and is not relevant in obtaining a low-dimensional ‘snapshot’ of the data. Based on this, we chose to keep two PCA dimensions.
# Numerical output
pca.taxa.num <- pca(OTUs_biplot_names, # getting the numerical values only
ncomp = 10,
center = TRUE,
scale = TRUE)
# Explained variance per PCA component
knitr::kable(pca.taxa.num$prop_expl_var$X, caption = "The proportion of explained variance per each PCA component is:")
| x | |
|---|---|
| PC1 | 0.2030513 |
| PC2 | 0.1235319 |
| PC3 | 0.0686204 |
| PC4 | 0.0537983 |
| PC5 | 0.0456617 |
| PC6 | 0.0431242 |
| PC7 | 0.0303674 |
| PC8 | 0.0226126 |
| PC9 | 0.0216532 |
| PC10 | 0.0178213 |
# The cumulative proportion of variance explained by each PCA component
knitr::kable(pca.taxa.num$cum.var, caption = "The cumulative proportion of variance explained by each PCA component")
| x | |
|---|---|
| PC1 | 0.2030513 |
| PC2 | 0.3265833 |
| PC3 | 0.3952037 |
| PC4 | 0.4490020 |
| PC5 | 0.4946637 |
| PC6 | 0.5377879 |
| PC7 | 0.5681553 |
| PC8 | 0.5907679 |
| PC9 | 0.6124211 |
| PC10 | 0.6302423 |
# Preparing the object to have taxa names on boxplots
GOs_biplot <- as.data.frame(megan_go_clr_5@otu_table) %>%
t() # mixOmics needs samples and microbes to be reordered, so transposing here
# Check dimensions of data
dim(GOs_biplot)
## [1] 191 4287
class(GOs_biplot)
## [1] "matrix" "array"
# Getting gene names
GOs_biplot_colnames_for_biplot <- left_join(otu_table(megan_go_clr_5) %>%
as.data.frame %>%
rownames_to_column("OTU"),
tax_table(megan_go_clr_5) %>%
as.data.frame %>%
rownames_to_column("OTU")) %>%
unite(Gene_annotations, c(Rank4), sep = "; ") # Adding Taxonomy info
## Joining, by = "OTU"
GOs_biplot_colnames_for_biplot <- GOs_biplot_colnames_for_biplot %>%
dplyr::select("OTU", "Gene_annotations")
# Merging with the OTUs_biplot object
GOs_biplot_names <- left_join(t(GOs_biplot) %>%
as.data.frame() %>%
rownames_to_column("OTU"),
GOs_biplot_colnames_for_biplot) %>%
unite(Annotations, c(OTU, Gene_annotations), sep = "_") %>%
column_to_rownames("Annotations") %>% # moving this as rowposing back into the right format
t() # trans
# PCA
result.pca.GOs.names <- pca(GOs_biplot_names)
# Plotting the PCA sample plot
plotIndiv(result.pca.GOs.names,
group = sample_data(megan_GO_5_abundant)$Sampling_trip,
title = 'PCA | Microbial Functions',
legend = T,
ellipse = TRUE,
ind.names = F,
col.per.group =c("indianred", # Sampling trip 1
"indianred4", # Sampling trip 2
"red3", # Sampling trip 3
"slateblue"), # Sampling trip 4
legend.title = 'Sampling trip'
)
# Plotting the PCA biplot
biplot(result.pca.GOs.names,
comp = c(1, 2),
group = sample_data(megan_GO_5_abundant)$Sampling_trip,
ind.names = F,
ellipse = T,
col.per.group =c("indianred", # Sampling trip 1
"indianred4", # Sampling trip 2
"red3", # Sampling trip 3
"slateblue"), # Sampling trip 4
legend = TRUE,
vline = T,
hline = T,
cutoff = 0.95,
legend.title = "Sampling trip")
# Parameter tuning
tune.pca.GOs <- tune.pca(GOs_biplot_names, ncomp = 10, scale = TRUE)
plot(tune.pca.GOs)
Screeplot from the PCA performed on the IMOS GBR-MGD metagenomics data (microbial genes - GO terms): Amount of explained variance for each principal component is shown. From the numerical output (shown bellow in tabular format), we observe that the first two principal components explain 60.31% of the total variance. The rule of thumb for choosing the number of PCA components is not so much to set a hard threshold based on the cumulative proportion of explained variance (as this is data-dependent), but to observe when a drop, or elbow, appears on the screeplot. The elbow indicates that the remaining variance is spread over many principal components and is not relevant in obtaining a low-dimensional ‘snapshot’ of the data. Based on this, we chose to keep two PCA dimensions.
# Numerical output
pca.GOs.num <- pca(GOs_biplot_names, # getting the numerical values only
ncomp = 10,
center = TRUE,
scale = TRUE)
# Explained variance per PCA component
knitr::kable(pca.GOs.num$prop_expl_var$X, caption = "The proportion of explained variance per each PCA component is:")
| x | |
|---|---|
| PC1 | 0.3436569 |
| PC2 | 0.1184468 |
| PC3 | 0.0592101 |
| PC4 | 0.0477584 |
| PC5 | 0.0386741 |
| PC6 | 0.0292849 |
| PC7 | 0.0203148 |
| PC8 | 0.0186514 |
| PC9 | 0.0143016 |
| PC10 | 0.0118723 |
# The cumulative proportion of variance explained by each PCA component
knitr::kable(pca.GOs.num$cum.var, caption = "The cumulative proportion of variance explained by each PCA component")
| x | |
|---|---|
| PC1 | 0.3436569 |
| PC2 | 0.4621037 |
| PC3 | 0.5213138 |
| PC4 | 0.5690722 |
| PC5 | 0.6077463 |
| PC6 | 0.6370312 |
| PC7 | 0.6573461 |
| PC8 | 0.6759974 |
| PC9 | 0.6902990 |
| PC10 | 0.7021713 |
Parameter tuning in mixOmics to identify the optimal number of principal components (PCs) showed that the variance explained by adding more than 2 PCs is insignificant for both taxonomy and function. Hence, 2 PCs were retained. PCA clustering identified a clear difference between summer and winter samples, for both taxonomy and function. However, this clustering becomes more evident at functional compared to taxonomic levels. The percentage of variance explained by the first 2 Principal components (PCs) equaled to ~26% for taxonomy, and 55% for functions (GO terms). A PERMANOVA test was then carried out to investigate which comparisons are statistically significant.
Analysis of similarities (ANOSIM) testing whether there is a statistically significant difference between two or more groups of sampling units - sampling trips. We will then perform a Pairwise PERMANOVA.
taxa.anosim <- left_join(otu_table(megan_genus_RA_no_rare) %>%
as.data.frame %>%
rownames_to_column("OTU"),
megan_genus_TAX %>%
rownames_to_column("OTU")) %>%
unite(taxonomy, c(OTU, Domain, Phylum, Class, Order, Family, Genus, Species), sep = "; ") %>%
column_to_rownames("taxonomy")
# Removing rows with NAs, because ANOSIM does not take in missing vals
taxa.anosim <- na.omit(taxa.anosim)
# Object is ready to perform the test
ano_taxa <- anosim(t(taxa.anosim),
sample_data(megan_genus_RA_no_rare)$Sampling_trip,
distance = "bray",
permutations = 9999)
# Results
ano_taxa
##
## Call:
## anosim(x = t(taxa.anosim), grouping = sample_data(megan_genus_RA_no_rare)$Sampling_trip, permutations = 9999, distance = "bray")
## Dissimilarity: bray
##
## ANOSIM statistic R: 0.2244
## Significance: 1e-04
##
## Permutation: free
## Number of permutations: 9999
Pairwise PERMANOVA - taxa
Phylum level
Out of the 29 bacterial and archaeal phyla we identified, the most abundant phyla were Cyanobacteria, Proteobacteria, and Bacteroidetes, respectively. Bacteroidetes increased in abundance for those samples collected during the peak of summer (February 2020), and were lowest in abundances during winter (July 2020).
# Before plotting the bar plots, I first need to prepare my objects
taxa.barplots_phylum <- left_join(otu_table(megan_genus_abundant) %>%
as.data.frame %>%
rownames_to_column("OTU"),
tax_table(megan_genus_abundant) %>%
as.data.frame %>%
rownames_to_column("OTU"))
## Joining with `by = join_by(OTU)`
# Now setting OTUs as row names
rownames(taxa.barplots_phylum) <- taxa.barplots_phylum[,1]
taxa.barplots_phylum[,1] <- NULL
# Now summarising raw counts at phylum level
taxa.barplots_phylum.sum <- ddply(taxa.barplots_phylum, "Phylum", numcolwise(sum))
# Ready to compute raw abundances per sample - by dividing cell value with column sum
taxa.barplots_phylum.sum_RA <- taxa.barplots_phylum.sum
for (i in 2:(ncol(taxa.barplots_phylum.sum))) { # '2:' as the first column is not numeric
taxa.barplots_phylum.sum_RA[i] <- taxa.barplots_phylum.sum_RA[i] / sum(taxa.barplots_phylum.sum_RA[i])
}
# taxa.barplots_phylum.sum_RA
# Now setting row names - RA
taxa.barplots_phylum.sum_RA <- taxa.barplots_phylum.sum_RA %>% remove_rownames %>% column_to_rownames(var="Phylum")
taxa.barplots_phylum_transposed_RA <- t(taxa.barplots_phylum.sum_RA)
# Now setting back the col names before melting - RA
taxa.barplots_phylum_transposed_RA <- tibble::rownames_to_column(as.data.frame(taxa.barplots_phylum_transposed_RA), "Sample_ID")
taxa.barplots_phylum_transposed_melt_RA = reshape2::melt(taxa.barplots_phylum_transposed_RA, id.vars=c("Sample_ID"))
# At last, we add metadata_barplots info so that I can make facets in the plot - RA
taxa.barplots_phylum_transposed_melt_RA <- left_join(taxa.barplots_phylum_transposed_melt_RA,
metadata %>% rownames_to_column("Sample_ID"))
## Joining with `by = join_by(Sample_ID)`
# Setting colors - hardcoding!
cols_phyla <- c(
"Acidobacteria" = "skyblue2", # Acidobacteria
"Actinobacteria" = "slateblue4", # Actinobacteria
"Bacteroidetes" = "salmon1", # Bacteroidetes
"Balneolaeota" = "plum2", # Balneolaeota
"Candidatus Kaiserbacteria" = "skyblue1", # Candidatus Kaiserbacteria
"Candidatus Marinimicrobia" = "slategray4", # Candidatus Marinimicrobia
"Candidatus Peregrinibacteria" = "lavenderblush3", # Candidatus Peregrinibacteria
"Candidatus Tectomicrobia" = "tomato1", # Candidatus Tectomicrobia
"Candidatus Thermoplasmatota" = "lightgoldenrod1", # Candidatus Thermoplasmatota
"Chlamydiae" = "olivedrab", # Chlamydiae
"Chlorobi" = "seagreen3", # Chlorobi
"Chloroflexi" = "slateblue3", # Chloroflexi
"Cyanobacteria" = "darkseagreen3", # Cyanobacteria
"Deinococcus-Thermus" = "rosybrown4", # Deinococcus-Thermus
"Euryarchaeota" = "violetred1", # Euryarchaeota
"Fibrobacteres" = "navajowhite2", # Fibrobacteres
"Firmicutes" = "indianred", # Firmicutes
"Fusobacteria" = "skyblue3", # Fusobacteria
"Gemmatimonadetes" = "tomato2", # Gemmatimonadetes
"Lentisphaerae" = "lightyellow", # Lentisphaerae
"Nitrospinae" = "khaki", # Nitrospinae
"Nitrospirae" = "rosybrown", # Nitrospirae
"Planctomycetes" = "mediumpurple1", # Planctomycetes
"Proteobacteria" = "lightblue", # Proteobacteria
"Rhodothermaeota" = "tomato3", # Rhodothermaeota
"Spirochaetes" = "wheat1", # Spirochaetes
"Tenericutes" = "palegoldenrod", # Tenericutes
"Thaumarchaeota" = "plum4", # Thaumarchaeota
"Verrucomicrobia" = "tan1" # Verrucomicrobia
)
# Ready to plot
ggplot(data=taxa.barplots_phylum_transposed_melt_RA,
aes(x=Sample_ID,
y=value,
fill=variable))+
geom_bar(stat = "identity")+
scale_y_continuous(expand = c(0,0))+
facet_wrap(~Sampling_trip, scales = "free", nrow = 4)+
# scale_fill_manual(values = cols)+
# facet_wrap(~Sampling_trip, scales = "free", nrow = 4)+
# facet_grid(~Sampling_trip, scales = "free_x", space = "free")+
scale_fill_manual(values = cols_phyla)+
ylab("Relative abundance of taxa (at Phylum level)")+
xlab("Reef sites")+
theme(axis.text.x = element_text(angle = 75, hjust = 1, size = 12),
#axis.ticks.x = element_blank(),
#axis.title.x = element_blank(),
strip.text = element_text(colour="black", size=12),
panel.grid = element_blank(),
panel.background = element_blank(),
legend.position = "right",
legend.title = element_blank(),
legend.text = element_text(size = 12))
# ggarrange(admix.bar_data, admix.bar_data_RA,
# ncol = 1, nrow = 2)
# Group by mean using R Base aggregate()
phylum_mean <- aggregate(taxa.barplots_phylum_transposed_melt_RA$value, by=list(taxa.barplots_phylum_transposed_melt_RA$variable), FUN=mean)
# Checking that the row sums will be 1
sum(phylum_mean$x)
## [1] 1
# It worked
# Printing as table now, by sorting the values too.
knitr::kable(arrange(phylum_mean, desc(x)), caption = "Mean relative abundances at Phylum level, across all samples. We observe that 47.27% of the reads cannot be annotated bellow the Phylum level.")
| Group.1 | x |
|---|---|
| Cyanobacteria | 0.6817959 |
| Proteobacteria | 0.2612396 |
| Bacteroidetes | 0.0253382 |
| Actinobacteria | 0.0159741 |
| Planctomycetes | 0.0048641 |
| Firmicutes | 0.0039481 |
| Verrucomicrobia | 0.0021430 |
| Balneolaeota | 0.0018664 |
| Thaumarchaeota | 0.0006134 |
| Spirochaetes | 0.0005483 |
| Euryarchaeota | 0.0004341 |
| Candidatus Thermoplasmatota | 0.0002843 |
| Fusobacteria | 0.0001528 |
| Lentisphaerae | 0.0001510 |
| Rhodothermaeota | 0.0001400 |
| Tenericutes | 0.0001209 |
| Nitrospinae | 0.0000852 |
| Candidatus Tectomicrobia | 0.0000550 |
| Nitrospirae | 0.0000548 |
| Acidobacteria | 0.0000537 |
| Chloroflexi | 0.0000419 |
| Deinococcus-Thermus | 0.0000264 |
| Candidatus Peregrinibacteria | 0.0000184 |
| Chlamydiae | 0.0000158 |
| Candidatus Kaiserbacteria | 0.0000110 |
| Gemmatimonadetes | 0.0000106 |
| Fibrobacteres | 0.0000058 |
| Candidatus Marinimicrobia | 0.0000051 |
| Chlorobi | 0.0000020 |
OTUs_non_annotated_phyla_to_remove <- c("1869227", "2", "2157")
megan_genus_abundant_known_phyla_only <- subset_taxa(megan_genus_abundant,
!taxa_names(megan_genus_abundant) %in% OTUs_non_annotated_phyla_to_remove)
# Before plotting the bar plots, I first need to prepare my objects
taxa.barplots_phylum_known <- left_join(otu_table(megan_genus_abundant_known_phyla_only) %>%
as.data.frame %>%
rownames_to_column("OTU"),
tax_table(megan_genus_abundant_known_phyla_only) %>%
as.data.frame() %>%
rownames_to_column("OTU")) %>%
column_to_rownames("OTU")
## Joining with `by = join_by(OTU)`
# Now summarising raw counts at phylum level
taxa.barplots_phylum_known.sum <- ddply(taxa.barplots_phylum_known, "Phylum", numcolwise(sum))
# Ready to compute raw abundances per sample - by dividing cell value with column sum
taxa.barplots_phylum_known.sum_RA <- taxa.barplots_phylum_known.sum
for (i in 2:(ncol(taxa.barplots_phylum_known.sum))) { # '2:' as the first column is not numeric
taxa.barplots_phylum_known.sum_RA[i] <- taxa.barplots_phylum_known.sum_RA[i] / sum(taxa.barplots_phylum_known.sum_RA[i])
}
# taxa.barplots_phylum.sum_RA
# Now setting row names - RA
taxa.barplots_phylum_known.sum_RA <- taxa.barplots_phylum_known.sum_RA %>% remove_rownames %>% column_to_rownames(var="Phylum")
taxa.barplots_phylum_known_transposed_RA <- t(taxa.barplots_phylum_known.sum_RA)
# Now setting back the col names before melting - RA
taxa.barplots_phylum_known_transposed_RA <- tibble::rownames_to_column(as.data.frame(taxa.barplots_phylum_known_transposed_RA), "Sample_ID")
taxa.barplots_phylum_known_transposed_melt_RA = reshape2::melt(taxa.barplots_phylum_known_transposed_RA, id.vars=c("Sample_ID"))
# At last, we add metadata_barplots info so that I can make facets in the plot - RA
taxa.barplots_phylum_known_transposed_melt_RA <- left_join(taxa.barplots_phylum_known_transposed_melt_RA,
metadata %>% rownames_to_column("Sample_ID"))
## Joining with `by = join_by(Sample_ID)`
# Setting colors - hardcoding!
cols_phyla_known <- c(
"Acidobacteria" = "skyblue2", # Acidobacteria
"Actinobacteria" = "slateblue4", # Actinobacteria
"Bacteroidetes" = "salmon1", # Bacteroidetes
"Balneolaeota" = "plum2", # Balneolaeota
"Candidatus Kaiserbacteria" = "skyblue1", # Candidatus Kaiserbacteria
"Candidatus Marinimicrobia" = "slategray4", # Candidatus Marinimicrobia
"Candidatus Peregrinibacteria" = "lavenderblush3", # Candidatus Peregrinibacteria
"Candidatus Tectomicrobia" = "tomato1", # Candidatus Tectomicrobia
"Candidatus Thermoplasmatota" = "lightgoldenrod1", # Candidatus Thermoplasmatota
"Chlamydiae" = "olivedrab", # Chlamydiae
"Chlorobi" = "seagreen3", # Chlorobi
"Chloroflexi" = "slateblue3", # Chloroflexi
"Cyanobacteria" = "darkseagreen3", # Cyanobacteria
"Deinococcus-Thermus" = "rosybrown4", # Deinococcus-Thermus
"Euryarchaeota" = "violetred1", # Euryarchaeota
"Fibrobacteres" = "navajowhite2", # Fibrobacteres
"Firmicutes" = "indianred", # Firmicutes
"Fusobacteria" = "skyblue3", # Fusobacteria
"Gemmatimonadetes" = "tomato2", # Gemmatimonadetes
"Lentisphaerae" = "lightyellow", # Lentisphaerae
"Nitrospinae" = "khaki", # Nitrospinae
"Nitrospirae" = "rosybrown", # Nitrospirae
"Planctomycetes" = "mediumpurple1", # Planctomycetes
"Proteobacteria" = "lightblue", # Proteobacteria
"Rhodothermaeota" = "tomato3", # Rhodothermaeota
"Spirochaetes" = "wheat1", # Spirochaetes
"Tenericutes" = "palegoldenrod", # Tenericutes
"Thaumarchaeota" = "plum4", # Thaumarchaeota
"Verrucomicrobia" = "tan1" # Verrucomicrobia
)
# Ready to plot
ggplot(data=taxa.barplots_phylum_known_transposed_melt_RA,
aes(x=Sample_ID,
y=value,
fill=variable))+
geom_bar(stat = "identity")+
scale_y_continuous(expand = c(0,0))+
facet_wrap(~Sampling_trip, scales = "free", nrow = 4)+
# scale_fill_manual(values = cols)+
# facet_wrap(~Sampling_trip, scales = "free", nrow = 4)+
# facet_grid(~Sampling_trip, scales = "free_x", space = "free")+
scale_fill_manual(values = cols_phyla_known)+
ylab("Relative abundance of taxa (at Phylum level)")+
xlab("Reef sites")+
theme(axis.text.x = element_text(angle = 75, hjust = 1, size = 12),
#axis.ticks.x = element_blank(),
#axis.title.x = element_blank(),
strip.text = element_text(colour="black", size=12),
panel.grid = element_blank(),
panel.background = element_blank(),
legend.position = "right",
legend.title = element_blank(),
legend.text = element_text(size = 12))
# ggarrange(admix.bar_data, admix.bar_data_RA,
# ncol = 1, nrow = 2)
# Group by mean using R Base aggregate()
phylum_mean_per_trip <- aggregate(taxa.barplots_phylum_known_transposed_melt_RA$value, by=list(taxa.barplots_phylum_known_transposed_melt_RA$variable, taxa.barplots_phylum_known_transposed_melt_RA$Sampling_trip),
FUN=mean)
# Checking that the row sums will be 1
sum(phylum_mean_per_trip$x)
## [1] 4
# It worked
knitr::kable(dcast(phylum_mean_per_trip, Group.1 ~ Group.2, value.var = "x"), caption = "Mean relative abundances at Phylum level, partitioned per trip.")
| Group.1 | Trip_01_Nov-Dec_2019 | Trip_02_January_2020 | Trip_03_February_2020 | Trip_04_July_2020 |
|---|---|---|---|---|
| Acidobacteria | 0.0000002 | 0.0000789 | 0.0000600 | 0.0000691 |
| Actinobacteria | 0.0199151 | 0.0206680 | 0.0131711 | 0.0110066 |
| Bacteroidetes | 0.0231314 | 0.0179987 | 0.0539265 | 0.0114112 |
| Balneolaeota | 0.0030615 | 0.0018065 | 0.0023532 | 0.0006050 |
| Candidatus Kaiserbacteria | 0.0000002 | 0.0000429 | 0.0000003 | 0.0000004 |
| Candidatus Marinimicrobia | 0.0000211 | 0.0000002 | 0.0000003 | 0.0000004 |
| Candidatus Peregrinibacteria | 0.0000188 | 0.0000392 | 0.0000170 | 0.0000012 |
| Candidatus Tectomicrobia | 0.0000002 | 0.0000499 | 0.0000074 | 0.0001389 |
| Candidatus Thermoplasmatota | 0.0000304 | 0.0004387 | 0.0000276 | 0.0005486 |
| Chlamydiae | 0.0000007 | 0.0000599 | 0.0000010 | 0.0000012 |
| Chlorobi | 0.0000002 | 0.0000068 | 0.0000003 | 0.0000004 |
| Chloroflexi | 0.0000002 | 0.0000542 | 0.0000459 | 0.0000611 |
| Cyanobacteria | 0.6667602 | 0.6833643 | 0.6592973 | 0.7095411 |
| Deinococcus-Thermus | 0.0000250 | 0.0000805 | 0.0000006 | 0.0000008 |
| Euryarchaeota | 0.0004029 | 0.0004352 | 0.0004526 | 0.0004434 |
| Fibrobacteres | 0.0000005 | 0.0000211 | 0.0000006 | 0.0000008 |
| Firmicutes | 0.0039582 | 0.0039924 | 0.0036522 | 0.0041295 |
| Fusobacteria | 0.0001832 | 0.0002426 | 0.0001629 | 0.0000441 |
| Gemmatimonadetes | 0.0000005 | 0.0000207 | 0.0000223 | 0.0000008 |
| Lentisphaerae | 0.0001383 | 0.0001333 | 0.0003590 | 0.0000166 |
| Nitrospinae | 0.0000292 | 0.0000557 | 0.0001079 | 0.0001370 |
| Nitrospirae | 0.0000292 | 0.0000829 | 0.0000454 | 0.0000582 |
| Planctomycetes | 0.0049462 | 0.0052827 | 0.0039829 | 0.0051175 |
| Proteobacteria | 0.2734323 | 0.2614350 | 0.2589273 | 0.2532678 |
| Rhodothermaeota | 0.0001171 | 0.0001941 | 0.0002318 | 0.0000412 |
| Spirochaetes | 0.0003928 | 0.0004973 | 0.0004043 | 0.0008249 |
| Tenericutes | 0.0002031 | 0.0000010 | 0.0003262 | 0.0000016 |
| Thaumarchaeota | 0.0006138 | 0.0006689 | 0.0005068 | 0.0006472 |
| Verrucomicrobia | 0.0025871 | 0.0022482 | 0.0019091 | 0.0018836 |
# Before plotting the bar plots, I first need to prepare my objects
counts_collapsed_genera <- left_join(otu_table(megan_genus_abundant_known_phyla_only) %>%
as.data.frame %>%
rownames_to_column("OTU"),
tax_table(megan_genus_abundant_known_phyla_only) %>%
as.data.frame %>%
rownames_to_column("OTU")) %>%
column_to_rownames("OTU")
## Joining with `by = join_by(OTU)`
# Now summarising raw counts at Genus (?) level
# taxa.barplots_top20_genera.sum_1 <- ddply(taxa.barplots_top20_genera, "Rank7", numcolwise(sum))
counts_collapsed_genera.sum <- ddply(counts_collapsed_genera, "Genus", numcolwise(sum)) %>%
column_to_rownames("Genus")
# ------------------------------------------- #
# Now finding the top 20 most abundant genera #
# ------------------------------------------- #
# Let's calculate the average Genus counts across all samples (Genus is in rows)
counts_collapsed_genera.sum$avg_value <- rowMeans(counts_collapsed_genera.sum)
# Order rows based on average value in descending order
counts_collapsed_genera.sum <- counts_collapsed_genera.sum[order(counts_collapsed_genera.sum$avg_value, decreasing = TRUE), ]
# Remove the avg_value column now - I don't need it anymore
counts_collapsed_genera.sum$avg_value <- NULL
# Now move the Unknown Genus row at the end
row_index <- which(rownames(counts_collapsed_genera.sum) == "NA")
# Move the identified row to the end of the data frame
counts_collapsed_genera.sum <- rbind(counts_collapsed_genera.sum[-row_index, ],
counts_collapsed_genera.sum[row_index, ]
)
# Now collapsing all values bellow the top 20 most abundant genera into "Other"
counts_collapsed_genera.sum.Others <- counts_collapsed_genera.sum
counts_collapsed_genera.sum.Others_top <- slice(counts_collapsed_genera.sum.Others, 1:20)
counts_collapsed_genera.sum.Others_bottom <- slice(counts_collapsed_genera.sum.Others, 21:n())
# Summarise rows from the 21st onwards into one summary row
summary_row <- colSums(counts_collapsed_genera.sum.Others_bottom[, sapply(counts_collapsed_genera.sum.Others_bottom, is.numeric)]) %>%
as.data.frame() %>%
t()
rownames(summary_row) <- "Other"
# Combine the top 20 rows with the summary row containing all other taxa (including those with "Unknown Genus")
counts_collapsed_genera.sum_21 <- rbind(counts_collapsed_genera.sum.Others_top, summary_row)
# Ready to compute raw abundances per sample - by dividing cell value with column sum
taxa.barplots_top20_genera.sum_RA <- counts_collapsed_genera.sum_21
for (i in 1:(ncol(counts_collapsed_genera.sum_21))) {
taxa.barplots_top20_genera.sum_RA[i] <- taxa.barplots_top20_genera.sum_RA[i] / sum(taxa.barplots_top20_genera.sum_RA[i])
}
taxa.barplots_top20_genera.sum_RA
# Checking that it sums up to 1
colSums(taxa.barplots_top20_genera.sum_RA)
## 11-049-1_S89_R1 11-049-2_S90_R1 11-049-3_S91_R1
## 1 1 1
## 11-049-4_S92_R1 11-162-1_S81_R1 11-162-2_S82_R1
## 1 1 1
## 11-162-3_S83_R1 11-162-4_S84_R1 13-124-1_S9_R1
## 1 1 1
## 13-124-2_S10_R1 13-124-3_S11_R1 13-124-4_S12_R1
## 1 1 1
## 21-550-1_S69_R1 21-550-2_S70_R1 21-550-3_S71_R1
## 1 1 1
## 21-550-4_S72_R1 21-580-1_S57_R1 21-580-2_S58_R1
## 1 1 1
## 21-580-3_S59_R1 21-580-4_S60_R1 22-084-1_S41_R1
## 1 1 1
## 22-084-2_S42_R1 22-084-3_S43_R1 22-084-4_S44_R1
## 1 1 1
## Agincourt1-1_S33_R1 Agincourt1-2_S34_R1 Agincourt1-3_S35_R1
## 1 1 1
## Agincourt1-4_S36_R1 Arlington-1_S37_R1 Arlington-2_S38_R1
## 1 1 1
## Arlington-3_S39_R1 Arlington-4_S40_R1 Boult-1_S25_R1
## 1 1 1
## Boult-2_S26_R1 Boult-3_S27_R1 Boult-4_S28_R1
## 1 1 1
## Broomfield-1_S49_R1 Broomfield-3_S51_R1 Broomfield-4_S52_R1
## 1 1 1
## Broomfield-rpt-2_S115_R1 Centipede-1_S57_R1 Centipede-2_S58_R1
## 1 1 1
## Centipede-3_S59_R1 Centipede-4_S60_R1 Chicken-1_S69_R1
## 1 1 1
## Chicken-2_S70_R1 Chicken-3_S71_R1 Chicken-4_S72_R1
## 1 1 1
## Chinaman-1_S65_R1 Chinaman-2_S66_R1 Chinaman-3_S67_R1
## 1 1 1
## Chinaman-4_S68_R1 Corbett-1_S17_R1 Corbett-2_S18_R1
## 1 1 1
## Corbett-3_S19_R1 Corbett-4_S20_R1 Davie-1_S1_R1
## 1 1 1
## Davie-2_S2_R1 Davie-3_S3_R1 Davie-4_S4_R1
## 1 1 1
## Erskine-1_S61_R1 Erskine-2_S62_R1 Erskine-3_S63_R1
## 1 1 1
## Erskine-4_S64_R1 Fairfax-1_S33_R1 Fairfax-2_S34_R1
## 1 1 1
## Fairfax-3_S35_R1 Fairfax-4_S36_R1 Farquaharson-1_S1_R1
## 1 1 1
## Farquaharson-2_S2_R1 Farquaharson-3_S3_R1 Farquaharson-4_S4_R1
## 1 1 1
## Feather-1_S5_R1 Feather-2_S6_R1 Feather-3_S7_R1
## 1 1 1
## Feather-4_S8_R1 Fore-and-Aft-1_S77_R1 Fore-and-Aft-2_S78_R1
## 1 1 1
## Fore-and-Aft-3_S79_R1 Fore-and-Aft-4_S80_R1 Fork-1_S49_R1
## 1 1 1
## Fork-2_S50_R1 Fork-3_S51_R1 Fork-4_S52_R1
## 1 1 1
## Grub-1_S65_R1 Grub-2_S66_R1 Grub-3_S67_R1
## 1 1 1
## Grub-4_S68_R1 Hastings-1_S41_R1 Hastings-2_S42_R1
## 1 1 1
## Hastings-3_S43_R1 Hastings-4_S44_R1 Hedley-1_S21_R1
## 1 1 1
## Hedley-2_S22_R1 Hedley-3_S23_R1 Helix-1_S61_R1
## 1 1 1
## Helix-2_S62_R1 Helix-3_S63_R1 Helix-4_S64_R1
## 1 1 1
## Hoskyn-1_S29_R1 Hoskyn-2_S30_R1 Hoskyn-3_S31_R1
## 1 1 1
## Hoskyn-4_S32_R1 JohnBrewer-1_S93_R1 JohnBrewer-2_S94_R1
## 1 1 1
## JohnBrewer-3_S97_R1 JohnBrewer-4_S98_R1 Kelso-1_S85_R1
## 1 1 1
## Kelso-2_S86_R1 Kelso-3_S87_R1 Kelso-4_S88_R1
## 1 1 1
## Knife-1_S45_R1 Knife-2_S46_R1 Knife-3_S47_R1
## 1 1 1
## Knife-4_S48_R1 Lagoon-1_S13_R1 Lagoon-2_S14_R1
## 1 1 1
## Lagoon-3_S15_R1 Lagoon-4_S16_R1 LittleKelso-1_S81_R1
## 1 1 1
## LittleKelso-2_S82_R1 LittleKelso-3_S83_R1 LittleKelso-4_S84_R1
## 1 1 1
## Lynchs-1_S99_R1 Lynchs-2_S100_R1 Lynchs-3_S101_R1
## 1 1 1
## Lynchs-4_S102_R1 Mantis-1_S85_R1 Mantis-2_S86_R1
## 1 1 1
## Mantis-3_S87_R1 Mantis-4_S88_R1 Masthead-1_S53_R1
## 1 1 1
## Masthead-2_S54_R1 Masthead-3_S55_R1 Masthead-4_S56_R1
## 1 1 1
## McCulloch-1_S17_R1 McCulloch-2_S18_R1 McCulloch-3_S19_R1
## 1 1 1
## McCulloch-4_S20_R1 McSweeney-1_S5_R1 McSweeney-2_S6_R1
## 1 1 1
## McSweeney-3_S7_R1 McSweeney-4_S8_R1 Monsoon-1_S21_R1
## 1 1 1
## Monsoon-2_S22_R1 Monsoon-3_S23_R1 Monsoon-4_S24_R1
## 1 1 1
## Moore-1_S25_R1 Moore-2_S26_R1 Moore-3_S27_R1
## 1 1 1
## Moore-4_S28_R1 Myrmidon-1_S53_R1 Myrmidon-2_S54_R1
## 1 1 1
## Myrmidon-3_S55_R1 Myrmidon-4_S56_R1 North-1_S37_R1
## 1 1 1
## North-2_S38_R1 North-3_S39_R1 North-4_S40_R1
## 1 1 1
## Peart-1_S13_R1 Peart-2_S14_R1 Peart-3_S15_R1
## 1 1 1
## Peart-4_S16_R1 Rib-1_S73_R1 Rib-2_S74_R1
## 1 1 1
## Rib-3_S75_R1 Rib-4_S76_R1 Roxburgh-1_S89_R1
## 1 1 1
## Roxburgh-2_S90_R1 Roxburgh-3_S91_R1 Roxburgh-4_S92_R1
## 1 1 1
## Sanbank1-1_S77_R1 Sanbank1-2_S78_R1 Sanbank1-3_S79_R1
## 1 1 1
## Sanbank1-4_S80_R1 SmallLagoon-1_S45_R1 SmallLagoon-2_S46_R1
## 1 1 1
## SmallLagoon-3_S47_R1 SmallLagoon-4_S48_R1 St-Crispin-1_S73_R1
## 1 1 1
## St-Crispin-2_S74_R1 St-Crispin-3_S75_R1 St-Crispin-4_S76_R1
## 1 1 1
## Taylor-1_S9_R1 Taylor-2_S10_R1 Taylor-3_S11_R1
## 1 1 1
## Taylor-4_S12_R1 Thetford-1_S29_R1 Thetford-2_S30_R1
## 1 1 1
## Thetford-3_S31_R1 Thetford-4_S32_R1
## 1 1
# Now setting row names - RA
taxa.barplots_top20_genera_transposed_RA <- t(taxa.barplots_top20_genera.sum_RA)
# Now setting back the col names before melting - RA
taxa.barplots_top20_genera_transposed_RA <- tibble::rownames_to_column(as.data.frame(taxa.barplots_top20_genera_transposed_RA), "Sample_ID")
taxa.barplots_top20_genera_transposed_melt_RA = reshape2::melt(taxa.barplots_top20_genera_transposed_RA, id.vars=c("Sample_ID"))
# At last, we add metadata_barplots info so that I can make facets in the plot - RA
taxa.barplots_top20_genera_transposed_melt_RA <- left_join(taxa.barplots_top20_genera_transposed_melt_RA,
metadata %>%
rownames_to_column("Sample_ID"))
## Joining with `by = join_by(Sample_ID)`
# Setting colors
cols_top20_genera <- c(
"Synechococcus" = "seagreen",
"Candidatus Pelagibacter" = "steelblue4",
"Prochlorococcus" = "palegreen3",
"Candidatus Actinomarina" = "powderblue",
"Candidatus Puniceispirillum" = "seashell2",
"Cyanobium" = "seagreen2",
"Marinovum" = "olivedrab1",
"Luminiphilus" = "maroon",
"Pseudomonas" = "grey45",
"Polaribacter" = "salmon",
"Rhodopirellula" = "tan",
"Flavobacterium" = "indianred4",
"Litoricola" = "turquoise",
"Vibrio" = "indianred3",
"Balneola" = "seagreen3",
"Erythrobacter" = "plum4",
"Pseudoalteromonas" = "slategrey",
"Sulfitobacter" = "plum",
"Nisaea" = "steelblue3",
"Candidatus Endolissoclinum" = "tan2",
"Other" = "snow3"
)
admix_top20_genera_RA=ggplot(data=taxa.barplots_top20_genera_transposed_melt_RA,
aes(x=Sample_ID,
y=value,
fill=variable))+
geom_bar(stat = "identity")+
scale_y_continuous(expand = c(0,0))+
facet_wrap(~Sampling_trip, scales = "free", nrow = 4)+
# facet_wrap(~Sampling_trip, scales = "free", ncol = 1, nrow = 4)+
# facet_grid(~Sampling_trip, scales = "free_x", space = "free")+
scale_fill_manual(values = cols_top20_genera)+
ylab("Relative abundances of taxa (at 'Genus' level)")+
xlab("Reef sites")+
theme(axis.text.x = element_text(angle = 75, hjust = 1, size = 12),
#axis.ticks.x = element_blank(),
#axis.title.x = element_blank(),
strip.text = element_text(colour="black", size=12),
panel.grid = element_blank(),
panel.background = element_blank(),
legend.position = "right",
legend.title = element_blank(),
legend.text = element_text(size = 12))
admix_top20_genera_RA
# Group by mean using R Base aggregate()
genera_mean <- aggregate(taxa.barplots_top20_genera_transposed_melt_RA$value, by=list(taxa.barplots_top20_genera_transposed_melt_RA$variable), FUN=mean)
# Checking that the row sums will be 1
sum(genera_mean$x)
## [1] 1
# It worked
# Printing as table now, by sorting the values too.
knitr::kable(arrange(genera_mean, desc(x)), caption = "Mean relative abundances at Genus level, across all samples. We only show the balue for the top 20 most abundant genera, and values for all others are collapsed within the category Other, shown in grey on the barplots.")
| Group.1 | x |
|---|---|
| Synechococcus | 0.5498770 |
| Candidatus Pelagibacter | 0.1589150 |
| Prochlorococcus | 0.1193653 |
| Other | 0.1012086 |
| Candidatus Actinomarina | 0.0120422 |
| Candidatus Puniceispirillum | 0.0090199 |
| Marinovum | 0.0074967 |
| Cyanobium | 0.0072463 |
| Luminiphilus | 0.0053392 |
| Pseudomonas | 0.0050729 |
| Polaribacter | 0.0048884 |
| Rhodopirellula | 0.0029601 |
| Flavobacterium | 0.0025491 |
| Vibrio | 0.0020786 |
| Litoricola | 0.0020743 |
| Pseudoalteromonas | 0.0018675 |
| Balneola | 0.0018007 |
| Erythrobacter | 0.0016710 |
| Sulfitobacter | 0.0015728 |
| Nisaea | 0.0014830 |
| Candidatus Endolissoclinum | 0.0014713 |
# Group by mean using R Base aggregate()
genera_mean_per_trip <- aggregate(taxa.barplots_top20_genera_transposed_melt_RA$value, by=list(taxa.barplots_top20_genera_transposed_melt_RA$variable, taxa.barplots_top20_genera_transposed_melt_RA$Sampling_trip), FUN=mean)
# Checking that the row sums will be 1
sum(genera_mean_per_trip$x)
## [1] 4
# It worked
knitr::kable(dcast(genera_mean_per_trip, Group.1 ~ Group.2, value.var = "x"), caption = "Mean relative abundances at Genus level, partitioned per trip.")
| Group.1 | Trip_01_Nov-Dec_2019 | Trip_02_January_2020 | Trip_03_February_2020 | Trip_04_July_2020 |
|---|---|---|---|---|
| Synechococcus | 0.5949829 | 0.6450816 | 0.6313425 | 0.3702787 |
| Candidatus Pelagibacter | 0.1624467 | 0.1490722 | 0.1555976 | 0.1671242 |
| Prochlorococcus | 0.0581456 | 0.0241497 | 0.0147776 | 0.3293883 |
| Candidatus Actinomarina | 0.0164315 | 0.0165645 | 0.0093994 | 0.0067467 |
| Candidatus Puniceispirillum | 0.0076347 | 0.0093176 | 0.0108462 | 0.0084509 |
| Cyanobium | 0.0083558 | 0.0082732 | 0.0079558 | 0.0049497 |
| Marinovum | 0.0111624 | 0.0079801 | 0.0057018 | 0.0055803 |
| Luminiphilus | 0.0070972 | 0.0052074 | 0.0066333 | 0.0030773 |
| Pseudomonas | 0.0057212 | 0.0056568 | 0.0055251 | 0.0037157 |
| Polaribacter | 0.0007300 | 0.0005458 | 0.0199170 | 0.0003381 |
| Rhodopirellula | 0.0033751 | 0.0031655 | 0.0015981 | 0.0035039 |
| Flavobacterium | 0.0024805 | 0.0018970 | 0.0044455 | 0.0017056 |
| Litoricola | 0.0044207 | 0.0028318 | 0.0015289 | 0.0000004 |
| Vibrio | 0.0024719 | 0.0025171 | 0.0016892 | 0.0016926 |
| Balneola | 0.0029779 | 0.0017469 | 0.0022828 | 0.0005516 |
| Erythrobacter | 0.0013159 | 0.0032885 | 0.0015133 | 0.0006846 |
| Pseudoalteromonas | 0.0010396 | 0.0016676 | 0.0006369 | 0.0036343 |
| Sulfitobacter | 0.0018072 | 0.0019539 | 0.0018984 | 0.0008121 |
| Nisaea | 0.0023232 | 0.0005941 | 0.0030482 | 0.0003829 |
| Candidatus Endolissoclinum | 0.0014874 | 0.0015088 | 0.0016374 | 0.0012991 |
| Other | 0.1035927 | 0.1069799 | 0.1120250 | 0.0860830 |
# Subsetting my phyloseq object - I only want Bacteroidetes!
megan_Bacteroidetes <- subset_taxa(megan_genus_abundant, # Phyloseq object with all OTUs
Phylum=="Bacteroidetes") # The phyloseq object with raw counts
# Shannon diversity
megan_genus_shannon_Bacteroidetes <- diversity(t(otu_table(megan_Bacteroidetes)), index = "shannon")
# Preparing the objects to visualise as boxplots
# Shannon
megan_genus_shannon_boxplots_Bacteroidetes <- as.data.frame(megan_genus_shannon_Bacteroidetes)
megan_genus_shannon_boxplots_Bacteroidetes <- tibble::rownames_to_column(megan_genus_shannon_boxplots_Bacteroidetes, "Sample_ID")
megan_genus_shannon_boxplots_Bacteroidetes <- left_join(megan_genus_shannon_boxplots_Bacteroidetes,
metadata %>%
rownames_to_column("Sample_ID"))
# Getting median and SD: https://stackoverflow.com/questions/13372734/how-to-display-the-median-value-in-a-faceted-boxplot-in-ggplot
Shannon_numerical_summ_Bacteroidetes <- ddply(megan_genus_shannon_boxplots_Bacteroidetes,
.(Sampling_trip),
summarize,
med = median(megan_genus_shannon_Bacteroidetes),
SD = sd(megan_genus_shannon_Bacteroidetes))
# Plotting
ggplot(megan_genus_shannon_boxplots_Bacteroidetes, aes(y = megan_genus_shannon_Bacteroidetes, x = Sampling_trip)) + # Shannon index
geom_boxplot(aes(fill=factor(Sampling_trip)), outlier.shape = NA) +
geom_jitter(size=1.2, alpha=0.5) + # adding actual data points
geom_text(data = Shannon_numerical_summ_Bacteroidetes, aes(y = med, label = round(med,2)),size = 4.5, vjust = -0.5) + # adding median as text
geom_text(data = Shannon_numerical_summ_Bacteroidetes, aes(y = SD, label = round(SD,2)),size = 4.5, vjust = -0.5) + # adding median as text
scale_fill_manual(values=c("indianred", # Sampling trip 1
"indianred4", # Sampling trip 2
"red3", # Sampling trip 3
"slateblue")) + # Sampling trip 4
coord_flip()+ # just flipping the plot
labs(title="Alpha diversity - within Bacteroidetes",
subtitle="Across sampling trips",
x="Sampling events",
y="Shannon index")+
theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 12)) +
theme_bw() +
ylim(1,5) +
stat_pvalue_manual(megan_genus_shannon_boxplots_Bacteroidetes %>%
pairwise_wilcox_test(megan_genus_shannon_Bacteroidetes ~ Sampling_trip) %>%
add_xy_position())
Bacteroidetes Shannon index
# I will modify this manually in Inkscape
| Sampling_trip | med | SD |
|---|---|---|
| Trip_01_Nov-Dec_2019 | 2.696195 | 0.4270093 |
| Trip_02_January_2020 | 2.642752 | 0.3609917 |
| Trip_03_February_2020 | 2.496523 | 0.6229173 |
| Trip_04_July_2020 | 2.237772 | 0.2510951 |
# Numerical output
knitr::kable(megan_genus_shannon_boxplots_Bacteroidetes %>%
pairwise_wilcox_test(megan_genus_shannon_Bacteroidetes ~ Sampling_trip),
caption = "Wilcoxon rank sum test to compare median Shanon Diversity between trips, computed within Bacteroidetes.")
| .y. | group1 | group2 | n1 | n2 | statistic | p | p.adj | p.adj.signif |
|---|---|---|---|---|---|---|---|---|
| megan_genus_shannon_Bacteroidetes | Trip_01_Nov-Dec_2019 | Trip_02_January_2020 | 44 | 48 | 1227 | 1.84e-01 | 3.68e-01 | ns |
| megan_genus_shannon_Bacteroidetes | Trip_01_Nov-Dec_2019 | Trip_03_February_2020 | 44 | 43 | 1145 | 9.20e-02 | 2.76e-01 | ns |
| megan_genus_shannon_Bacteroidetes | Trip_01_Nov-Dec_2019 | Trip_04_July_2020 | 44 | 56 | 2049 | 0.00e+00 | 1.00e-07 | **** |
| megan_genus_shannon_Bacteroidetes | Trip_02_January_2020 | Trip_03_February_2020 | 48 | 43 | 1146 | 3.69e-01 | 3.69e-01 | ns |
| megan_genus_shannon_Bacteroidetes | Trip_02_January_2020 | Trip_04_July_2020 | 48 | 56 | 2161 | 1.00e-07 | 5.00e-07 | **** |
| megan_genus_shannon_Bacteroidetes | Trip_03_February_2020 | Trip_04_July_2020 | 43 | 56 | 1519 | 2.60e-02 | 1.06e-01 | ns |
Does diversity differ across trips when computed on overall communities?
# Shannon diversity
megan_genus_shannon <- diversity(t(otu_table(megan_genus_abundant)), index = "shannon")
# Preparing the objects to visualise as boxplots
# Shannon
megan_genus_shannon_boxplots <- as.data.frame(megan_genus_shannon)
megan_genus_shannon_boxplots <- tibble::rownames_to_column(megan_genus_shannon_boxplots, "Sample_ID")
megan_genus_shannon_boxplots <- left_join(megan_genus_shannon_boxplots,
metadata %>% rownames_to_column("Sample_ID"))
# Getting median and SD: https://stackoverflow.com/questions/13372734/how-to-display-the-median-value-in-a-faceted-boxplot-in-ggplot
Shannon_numerical_summ <- ddply(megan_genus_shannon_boxplots,
.(Sampling_trip),
summarize,
med = median(megan_genus_shannon),
SD = sd(megan_genus_shannon))
# Plotting
ggplot(megan_genus_shannon_boxplots, aes(y = megan_genus_shannon, x = Sampling_trip)) + # Shannon index
geom_boxplot(aes(fill=factor(Sampling_trip)), outlier.shape = NA) +
geom_jitter(size=1.2, alpha=0.5) + # adding actual data points
geom_text(data = Shannon_numerical_summ, aes(y = med, label = round(med,2)),size = 4.5, vjust = -0.5) + # adding median as text
geom_text(data = Shannon_numerical_summ, aes(y = SD, label = round(SD,2)),size = 4.5, vjust = -0.5) + # adding median as text
scale_fill_manual(values=c("indianred", # Sampling trip 1
"indianred4", # Sampling trip 2
"red3", # Sampling trip 3
"slateblue")) + # Sampling trip 4
coord_flip()+ # just flipping the plot
labs(title="Alpha diversity - overall microbial communities",
subtitle="Across sampling trips",
x="Sampling events",
y="Shannon index")+
theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 12)) +
theme_bw() +
stat_pvalue_manual(megan_genus_shannon_boxplots %>%
pairwise_wilcox_test(megan_genus_shannon ~ Sampling_trip) %>%
add_xy_position())
Bacteroidetes Shannon index
# I will modify this manually in Inkscape
knitr::kable(Shannon_numerical_summ, caption = "Median and standard deviation for Shannon Index values, computed within trips.")
| Sampling_trip | med | SD |
|---|---|---|
| Trip_01_Nov-Dec_2019 | 1.800334 | 0.4372334 |
| Trip_02_January_2020 | 1.576837 | 0.5217831 |
| Trip_03_February_2020 | 1.803015 | 0.4967308 |
| Trip_04_July_2020 | 1.735373 | 0.2994645 |
# Numerical output
knitr::kable(megan_genus_shannon_boxplots %>%
pairwise_wilcox_test(megan_genus_shannon ~ Sampling_trip),
caption = "Wilcoxon rank sum test to compare median Shanon Diversity between trips, computed for overall communities.")
| .y. | group1 | group2 | n1 | n2 | statistic | p | p.adj | p.adj.signif |
|---|---|---|---|---|---|---|---|---|
| megan_genus_shannon | Trip_01_Nov-Dec_2019 | Trip_02_January_2020 | 44 | 48 | 1256 | 0.119 | 0.714 | ns |
| megan_genus_shannon | Trip_01_Nov-Dec_2019 | Trip_03_February_2020 | 44 | 43 | 954 | 0.949 | 1.000 | ns |
| megan_genus_shannon | Trip_01_Nov-Dec_2019 | Trip_04_July_2020 | 44 | 56 | 1379 | 0.309 | 1.000 | ns |
| megan_genus_shannon | Trip_02_January_2020 | Trip_03_February_2020 | 48 | 43 | 911 | 0.340 | 1.000 | ns |
| megan_genus_shannon | Trip_02_January_2020 | Trip_04_July_2020 | 48 | 56 | 1197 | 0.339 | 1.000 | ns |
| megan_genus_shannon | Trip_03_February_2020 | Trip_04_July_2020 | 43 | 56 | 1378 | 0.221 | 1.000 | ns |
go.anosim <- left_join(otu_table(megan_GO_5_RA_no_rare) %>%
as.data.frame %>%
rownames_to_column("OTU"),
megan_GO_5_FUN %>%
rownames_to_column("OTU")) %>%
unite(taxonomy, c(OTU, Rank1, Rank2, Rank3, Rank4, Rank5, Rank6#, Rank7, Rank8
), sep = "; ") %>%
column_to_rownames("taxonomy")
# Removing rows with NAs, because ANOSIM does not take in missing vals
go.anosim <- na.omit(go.anosim)
# Object is ready to perform the test
ano_go <- anosim(t(go.anosim),
sample_data(megan_GO_5_RA_no_rare)$Sampling_trip,
distance = "bray",
permutations = 9999)
# Results
ano_go
##
## Call:
## anosim(x = t(go.anosim), grouping = sample_data(megan_GO_5_RA_no_rare)$Sampling_trip, permutations = 9999, distance = "bray")
## Dissimilarity: bray
##
## ANOSIM statistic R: 0.3743
## Significance: 1e-04
##
## Permutation: free
## Number of permutations: 9999
Pairwise PERMANOVA - GO terms (rank 5)
# Compute the mantel tests - cite the source of where this is coming from!
multimantel<-function(distance,env.df,geo.dist){
BCdist<-distance
statistic<-NULL
pval<-NULL
n.obs<-NULL
for (i in 1:ncol(env.df)){
na.pos<-which(is.na(env.df[,i]))
if (length(na.pos)>0) tmp<-mantel.partial(as.dist(as.matrix(BCdist)[-c(na.pos),-c(na.pos)]),dist(env.df[-c(na.pos),i]),as.dist(as.matrix(geo.dist)[-c(na.pos),-c(na.pos)]),method = "pearson",permutations = 1000) else tmp<-mantel.partial(BCdist,dist(env.df[,i]),geo.dist,method = "pearson",permutations = 1000)
statistic<-c(statistic,tmp$statistic)
pval<-c(pval,tmp$signif)
n.obs<-c(n.obs,nrow(env.df)-length(na.pos))
}
data.frame(var=colnames(env.df),statistic,pval,p.corr=p.adjust(pval,method="bonferroni"),n.obs)
}
### Calculate Bray-Curtis dissimilarities - doing this on the Relative abundance data when rare taxa were excluded
# Taxonomy
megan_genus_dist <- vegdist(t(otu_table(megan_genus_RA_no_rare)), method = "bray")
# GO terms
megan_go_dist <- vegdist(t(otu_table(megan_GO_5_RA_no_rare)), method = "bray")
# Getting distances (in km) for IMOS-MGD sites - this is important because the Mantels will be corrected for geography
# Getting distances (in km) for IMOS-MGD sites
metadata_Mantel <- sample_data(megan_genus_clr) %>%
as.matrix() %>%
as.data.frame() %>%
rownames_to_column("Sample_ID")
# Importing the coordinates
map_coords_Mantel <- read.csv("/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Paper_drafts/the_analysis_is_finalised/Code/input_files/MARKO_for_eReefs_Lats_Longs.csv")
map_coords_Mantel <- left_join(metadata_Mantel[,c(1,2)], map_coords_Mantel, by = c("REEF_NAME" = "name"))
# map_coords <- map_coords %>% remove_rownames %>% column_to_rownames(var="Sample_ID")
map_coords_Mantel$REEF_NAME <- NULL
names(map_coords_Mantel)[names(map_coords_Mantel) == 'Sample_ID'] <- 'name'
# Setting first column as row names
map_coords_Mantel <- map_coords_Mantel %>%
remove_rownames %>%
column_to_rownames(var="name")
# Need to reorder as pointDistance() function requires longitude to go first
map_coords_reorder <- map_coords_Mantel %>%
relocate(lon, lat)
# Probably better to compute this 4 times for each of the trips, but first need to make sure that this code works
IMOS_mar.dist.mat <- round(pointDistance(map_coords_reorder, lonlat=TRUE) / 1000)
rownames(IMOS_mar.dist.mat) <- metadata_Mantel$Sample_ID
colnames(IMOS_mar.dist.mat) <- metadata_Mantel$Sample_ID
# Trick here: Now adding one column to the front so that I can make the correlation plot for both midshelf and offshore reefs
metadata_Mantel <- cbind(a = 0, metadata_Mantel)
# partial Mantels - microbial taxa
partial_Mantel_taxa_res <- multimantel(as.dist(as.matrix(megan_genus_dist)[metadata_Mantel$a=="0",metadata_Mantel$a=="0"]), # Distance object, doing it only
# for the epipelagic layer
metadata_Mantel[metadata_Mantel$a=="0", colnames(metadata_Mantel[,c(26:42)])], # columns 26-42 will extract numerical values
as.dist(as.matrix(IMOS_mar.dist.mat)[metadata_Mantel$a=="0", metadata_Mantel$a=="0"])) #[env.mat$epi=="EPI", env.mat$epi=="EPI"])) # I only need the geographic
# distances, in km
knitr::kable(partial_Mantel_taxa_res %>% arrange(abs(statistic)),
caption = "Partial Mantel tests assessing which physico-chemical parameters mat act as significant drivers of seawater microbiomes at the taxonomic level."
)
| var | statistic | pval | p.corr | n.obs |
|---|---|---|---|---|
| FLUORESCENCE_2.5m_RV | 0.0106918 | 0.3536464 | 1.0000000 | 191 |
| median_TDN_µM | -0.0160852 | 0.6793207 | 1.0000000 | 191 |
| median_NO3_µM | 0.0175296 | 0.2907093 | 1.0000000 | 191 |
| median_DOC_µM | -0.0227709 | 0.8021978 | 1.0000000 | 191 |
| median_Si_µM | 0.0237783 | 0.1858142 | 1.0000000 | 191 |
| SALINITY_2.5m_RV | -0.0350001 | 0.8911089 | 1.0000000 | 191 |
| median_TSS_mg_L | -0.0429887 | 0.8241758 | 1.0000000 | 187 |
| median_Chlorophyll_A_µg_L | 0.0573543 | 0.0989011 | 1.0000000 | 191 |
| median_Phaeophytin_A_µg_L | 0.0630440 | 0.0799201 | 1.0000000 | 191 |
| median_NO2_µM | 0.1063563 | 0.0009990 | 0.0169830 | 191 |
| median_NH4_µM | 0.1074423 | 0.0079920 | 0.1358641 | 191 |
| median_PP_µM | 0.1322265 | 0.0019980 | 0.0339660 | 187 |
| median_TDP_µM | 0.2247735 | 0.0009990 | 0.0169830 | 191 |
| median_POC_µM | 0.2335920 | 0.0009990 | 0.0169830 | 191 |
| median_PN_µM | 0.2648581 | 0.0009990 | 0.0169830 | 191 |
| SEAWATER_TEMPERATURE_2.5m_RV | 0.2957969 | 0.0009990 | 0.0169830 | 191 |
| median_PO4_µM | 0.3630517 | 0.0009990 | 0.0169830 | 191 |
# partial Mantels - microbial function (GO terms)
partial_Mantel_GOs_res <- multimantel(as.dist(as.matrix(megan_go_dist)[metadata_Mantel$a=="0",metadata_Mantel$a=="0"]), # Distance object, doing it only
# for the epipelagic layer
metadata_Mantel[metadata_Mantel$a=="0", colnames(metadata_Mantel[,c(26:42)])], # columns 26-42 will extract numerical values
as.dist(as.matrix(IMOS_mar.dist.mat)[metadata_Mantel$a=="0", metadata_Mantel$a=="0"])) #[env.mat$epi=="EPI", env.mat$epi=="EPI"])) # I only need the geographic
# distances, in km
knitr::kable(partial_Mantel_GOs_res %>% arrange(abs(statistic)),
caption = "Partial Mantel tests assessing which physico-chemical parameters mat act as significant drivers of seawater microbiomes at the functional level."
)
| var | statistic | pval | p.corr | n.obs |
|---|---|---|---|---|
| median_Si_µM | 0.0111393 | 0.3076923 | 1.0000000 | 191 |
| SALINITY_2.5m_RV | -0.0130414 | 0.6993007 | 1.0000000 | 191 |
| median_TSS_mg_L | -0.0242762 | 0.7142857 | 1.0000000 | 187 |
| median_TDN_µM | 0.0293237 | 0.1438561 | 1.0000000 | 191 |
| median_NO2_µM | 0.0304584 | 0.1388611 | 1.0000000 | 191 |
| median_NO3_µM | -0.0370669 | 0.8851149 | 1.0000000 | 191 |
| median_NH4_µM | 0.0511959 | 0.0459540 | 0.7812188 | 191 |
| FLUORESCENCE_2.5m_RV | 0.0532699 | 0.0279720 | 0.4755245 | 191 |
| median_Phaeophytin_A_µg_L | 0.0930029 | 0.0069930 | 0.1188811 | 191 |
| median_Chlorophyll_A_µg_L | 0.1126347 | 0.0019980 | 0.0339660 | 191 |
| median_DOC_µM | 0.1157628 | 0.0009990 | 0.0169830 | 191 |
| median_PP_µM | 0.1189701 | 0.0009990 | 0.0169830 | 187 |
| median_PN_µM | 0.2554285 | 0.0009990 | 0.0169830 | 191 |
| median_TDP_µM | 0.2583475 | 0.0009990 | 0.0169830 | 191 |
| median_PO4_µM | 0.2640840 | 0.0009990 | 0.0169830 | 191 |
| median_POC_µM | 0.2755923 | 0.0009990 | 0.0169830 | 191 |
| SEAWATER_TEMPERATURE_2.5m_RV | 0.2990635 | 0.0009990 | 0.0169830 | 191 |
# WQ
partial_Mantel_cor.mat_taxa_WQ <- data.frame(Taxonomy=partial_Mantel_taxa_res$statistic,
# GO_terms=IMOS_res_go_WQ$statistic, # Transcriptome=res.metaT$statistic,
row.names = partial_Mantel_taxa_res$var)
partial_Mantel_pcor.mat_taxa_WQ <- data.frame(Taxonomy=partial_Mantel_taxa_res$pval,
# GO_terms=IMOS_res_go_WQ$p.corr, # Expression=res.exp$pval,
row.names = partial_Mantel_taxa_res$var)# ,Transcriptome=res.metaT$pval)
# Ordering - highest correlations first
# WQ_ordre<-order(apply(IMOS_cor.mat_WQ[,1:2],1,mean),decreasing = T)
# WQ
partial_Mantel_cor.mat_GOs_WQ <- data.frame(Functions=partial_Mantel_GOs_res$statistic,
# GO_terms=IMOS_res_go_WQ$statistic, # Transcriptome=res.metaT$statistic,
row.names = partial_Mantel_GOs_res$var)
partial_Mantel_pcor.mat_GOs_WQ <- data.frame(Functions=partial_Mantel_GOs_res$pval,
# GO_terms=IMOS_res_go_WQ$p.corr, # Expression=res.exp$pval,
row.names = partial_Mantel_GOs_res$var)# ,Transcriptome=res.metaT$pval)
# Ordering - highest correlations first
# WQ_ordre<-order(apply(IMOS_cor.mat_WQ[,1:2],1,mean),decreasing = T)
# Let's visualise this! as heatmaps:
# Taxonomy
heatmap_partial_Mantels_taxa_WQ <- ggcorrplot(partial_Mantel_cor.mat_taxa_WQ,#[ordre,], # Strongest drivers first
p.mat=partial_Mantel_pcor.mat_taxa_WQ,#[ordre,], # Strongest drivers first
insig = "blank",
sig.level = 0.05,
method = "square",
lab=T,
lab_size = 2.5,
colors=c("#2874b2","white","#ba2832"))
heatmap_partial_Mantels_taxa_WQ
# Functions
heatmap_partial_Mantels_GOs_WQ <- ggcorrplot(partial_Mantel_cor.mat_GOs_WQ,#[ordre,], # Strongest drivers first
p.mat=partial_Mantel_pcor.mat_GOs_WQ,#[ordre,], # Strongest drivers first
insig = "blank",
sig.level = 0.05,
method = "square",
lab=T,
lab_size = 2.5,
colors=c("#2874b2","white","#ba2832"))
heatmap_partial_Mantels_GOs_WQ
# Merging the two
# patchwork::wrap_plots(heatmap_partial_Mantels_taxa_WQ,
# heatmap_partial_Mantels_GOs_WQ,
# nrow = 2,
# ncol = 1)
To (1) identify stable microbial indicators—both taxonomic and functional—that consistently respond to specific physico-chemical variables (e.g., nutrient loads, temperature, salinity) across broad spatio-temporal scales in the GBRwe extended a Sparse Partial Least Squares analysis (sPLS, see Lê Cao et al. 2008, 2009) widely used in microbial oceanography to correlate microbial data with continuous environmental metrics (see e.g. Guidi et al. 2016; Jameson et al. 2023; Priest et al. 2023) with a Multivariate INTegrative method (MINT, see Rohart et al., 2017a) to integrate data from four independent sampling trips We also attempted MINT (Multivariate INTegration, Rohart et al. (2017b)), a method based on multi-group PLS that includes information about samples belonging to independent groups or studies (Eslami et al., 2014). In this context, the challenge was to accommodate for confounding effects between season and geography as each site was sampled only once in time and space. By using MINT sPLS, we aimed to identify microbial indicator taxa and genes that correlate to water chemistry metrics and are shared across the four sampling transects, regardless of geography or season. Similar to sPLS, in MINT sPLS we retained two dimensions and 50 features (microbial taxa or genes) per dimension for the X datasets, and all WQ metrics for the Y dataset.
But the MINT sPLS sample plot and circle correlation plots can be combined into a biplot, which will present both types of information.
Bellow is the code from Kim-Anh to create a MINT sPLS biplot:
# create MINT sPLS object
data(stemcells)
# for the purpose of this example, we artificially
# create a continuous response Y by taking genes 1:10.
X = stemcells$gene[,-c(1:10)]
# renaming columns here so that I can identify the X and Y datasets
colnames(X) = paste('X', 1:ncol(X), sep = '.')
Y = stemcells$gene[,1:10]
# renaming columns here so that I can identify the X and Y datasets
colnames(Y) = paste('Y', 1:ncol(Y), sep = '.')
# here selecting only on X
res = mint.spls(X = X, Y = Y, ncomp = 2,
keepX = c(10, 5), study = stemcells$study)
plotIndiv(res) # symbol represent study
plotVar(res)
library(ggrepel)
# INPUT ARGUMENTS
col = res$study # color of samples according to the study
pch = res$study # pch of samples according to the study
var.names.col = 'grey40'
var.names.size = 4
var.arrow.col.X = 'lightblue' # color of arrow + name for X data set (could be set as a vector length the number of variables selected in X)
var.arrow.col.Y = 'orange'
var.arrow.size = 0.5
var.arrow.length = 0.2
# components to be plotted
comp1 = 1
comp2 = 2
# input the MINT res object
object <- res
comp <- object$ncomp
## --- code starts here --------
# identify variables selected
selection.X <- rowSums(object$loadings$X[, 1:comp]) != 0
selection.Y <- rowSums(object$loadings$Y[, 1:comp]) != 0
loadings.X <- data.frame(object$loadings$X[selection.X, ])
loadings.Y <- data.frame(object$loadings$Y[selection.Y, ])
# if cutoff for the correlation circle plot (not used here, if you do I think things will break!)
cutoff <- 0
cors.X <- cor(object$X[, selection.X], object$variates$X[, 1:comp], use = 'pairwise' )
cors.Y <- cor(object$Y[, selection.Y], object$variates$Y[, 1:comp], use = 'pairwise' )
above.cutoff.X <- apply(cors.X, 1, function(x) any(abs(x) >= cutoff))
above.cutoff.Y <- apply(cors.Y, 1, function(x) any(abs(x) >= cutoff))
loadings.X <- loadings.X[above.cutoff.X,]
loadings.Y <- loadings.Y[above.cutoff.Y,]
# only representing the samples in the X space
variates <- object$variates$X
variates <- data.frame(variates)
## scaler of var vs sample coordinates
scaler <- max(variates, na.rm = TRUE)/max(abs(cors.X), na.rm = TRUE)
## potentially need to extend this for the Y scaler??
axes.titles <- c('Comp 1', 'Comp 2')
## ------------- outline of plot -----
gg_biplot <-
ggplot() +
theme_classic() +
labs(x = axes.titles[1],
y = axes.titles[2])
# ## vline and hline - you may want to comment this if you dont want these lines
gg_biplot <- gg_biplot + geom_vline(xintercept = 0, size = 0.3, col = 'grey75')
gg_biplot <- gg_biplot + geom_hline(yintercept = 0, size = 0.3, col = 'grey75')
gg_biplot
# ------
# PLOT SAMPLES
gg_biplot <- gg_biplot +
geom_point(aes(x = variates[, comp1],
y = variates[, comp2], col = col, shape = pch),
size = 2,
show.legend = FALSE)
gg_biplot
# PLOT VARIABLES
# the correlations are rescaled - need to fiddle a bit here
cors.X <- cors.X*scaler*0.8
cors.Y <- cors.Y*scaler*0.8
## lines and arrows
# X variables
gg_biplot <-
gg_biplot + geom_segment(
aes(
x = 0,
y = 0,
xend = cors.X[,comp1],
yend = cors.X[,comp2],
),
col = var.arrow.col.X,
arrow = arrow(length = unit(var.arrow.length, "cm")),
size = var.arrow.size,
show.legend = FALSE
)
gg_biplot
# Y variables
gg_biplot <-
gg_biplot + geom_segment(
aes(
x = 0,
y = 0,
xend = cors.Y[,comp1],
yend = cors.Y[,comp2],
),
col = var.arrow.col.Y,
arrow = arrow(length = unit(var.arrow.length, "cm")),
size = var.arrow.size,
show.legend = FALSE
)
gg_biplot
## labels of X variables
var.labels.X <- rownames(loadings.X)
gg_biplot <-
gg_biplot + geom_text_repel(
aes(
x = cors.X[, comp1],
y = cors.X[, comp2],
label = var.labels.X),
col = var.arrow.col.X)
## labels of Y variables
var.labels.Y <- rownames(loadings.Y)
gg_biplot <-
gg_biplot + geom_text_repel(
aes(
x = cors.Y[, comp1],
y = cors.Y[, comp2],
label = var.labels.Y),
col = var.arrow.col.Y)
gg_biplot
Which I applied on our data, first on taxa:
# renaming columns here so that I can identify the X and Y datasets
metadata_MINT_biplot <- sample_data(megan_genus_clr)[,24:40]
# Let's make the names nicer for plotting:
old_names <- c("median_Chlorophyll_A_µg_L", "median_Phaeophytin_A_µg_L",
"median_PN_µM", "median_POC_µM", "median_PP_µM",
"median_DOC_µM", "median_PO4_µM", "median_NH4_µM",
"median_NO2_µM", "median_NO3_µM", "median_Si_µM",
"median_TDN_µM", "median_TDP_µM", "median_TSS_mg_L",
"SEAWATER_TEMPERATURE_2.5m_RV", "SALINITY_2.5m_RV",
"FLUORESCENCE_2.5m_RV")
new_names <- c("Ch-a", "Phaeo", "PN", "POC", "PP",
"DOC", "PO4", "NH4", "NO2", "NO3", "Si", "TDN",
"TDP", "TSS", "SST_2.5m_RV", "SALINITY_2.5m_RV",
"FLUORESCENCE_2.5m_RV")
# Find indices of old names in current column names
indices <- match(old_names, colnames(metadata_MINT_biplot))
# Replace old names with new names
colnames(metadata_MINT_biplot)[indices] <- new_names
# here selecting only on X
res = mint.spls(X = OTUs_biplot_names,
Y = metadata_MINT_biplot,
ncomp = 2,
keepX = c(10, 10),
study = sample_data(megan_genus_clr)$Sampling_trip)
plotIndiv(res)
plotIndiv(res,
group = res$study,
# title = 'global MINT sPLS | Microbial Taxonomy-WQ',
legend = T,
rep.space = "XY-variate",
col.per.group =c("indianred", # Sampling trip 1
"indianred4", # Sampling trip 2
"red3", # Sampling trip 3
"slateblue"), # Sampling trip 4
# legend.title = 'Sampling Trip'
) # symbol represent study
plotVar(res)
library(ggrepel)
# INPUT ARGUMENTS
col = res$study # color of samples according to the study
pch = res$study # pch of samples according to the study
var.names.col = 'grey40'
var.names.size = 4
var.arrow.col.X = 'lightblue' # color of arrow + name for X data set (could be set as a vector length the number of variables selected in X)
var.arrow.col.Y = 'orange'
var.arrow.size = 0.5
var.arrow.length = 0.2
# components to be plotted
comp1 = 1
comp2 = 2
# input the MINT res object
object <- res
comp <- object$ncomp
## --- code starts here --------
# identify variables selected
selection.X <- rowSums(object$loadings$X[, 1:comp]) != 0
selection.Y <- rowSums(object$loadings$Y[, 1:comp]) != 0
loadings.X <- data.frame(object$loadings$X[selection.X, ])
loadings.Y <- data.frame(object$loadings$Y[selection.Y, ])
# if cutoff for the correlation circle plot (not used here, if you do I think things will break!)
cutoff <- 0
cors.X <- cor(object$X[, selection.X], object$variates$X[, 1:comp], use = 'pairwise' )
cors.Y <- cor(object$Y[, selection.Y], object$variates$Y[, 1:comp], use = 'pairwise' )
above.cutoff.X <- apply(cors.X, 1, function(x) any(abs(x) >= cutoff))
above.cutoff.Y <- apply(cors.Y, 1, function(x) any(abs(x) >= cutoff))
loadings.X <- loadings.X[above.cutoff.X,]
loadings.Y <- loadings.Y[above.cutoff.Y,]
# only representing the samples in the X space
variates <- object$variates$X
variates <- data.frame(variates)
## scaler of var vs sample coordinates - added one for each data set
scaler.X <- max(object$variates$X, na.rm = TRUE)/max(abs(cors.X), na.rm = TRUE)
scaler.Y <- max(object$variates$Y, na.rm = TRUE)/max(abs(cors.Y), na.rm = TRUE)
axes.titles <- c('Comp 1', 'Comp 2')
## ------------- outline of plot -----
gg_biplot <-
ggplot() +
theme_classic() +
labs(x = axes.titles[1],
y = axes.titles[2])
# ## vline and hline - you may want to comment this if you dont want these lines
gg_biplot <- gg_biplot + geom_vline(xintercept = 0, size = 0.3, col = 'grey75')
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
gg_biplot <- gg_biplot + geom_hline(yintercept = 0, size = 0.3, col = 'grey75')
gg_biplot
# ------
# PLOT SAMPLES
gg_biplot <- gg_biplot +
geom_point(aes(x = variates[, comp1],
y = variates[, comp2], col = col, shape = pch),
size = 2,
show.legend = FALSE) +
scale_color_manual(values = c("indianred", # Sampling trip 1
"indianred4", # Sampling trip 2
"red3", # Sampling trip 3
"slateblue") # Sampling trip 4
)
gg_biplot
# PLOT VARIABLES
# the correlations are rescaled - need to fiddle a bit here
cors.X <- cors.X*scaler.X*0.7
cors.Y <- cors.Y*scaler.Y*0.8
## lines and arrows
# X variables
gg_biplot <-
gg_biplot + geom_segment(
aes(
x = 0,
y = 0,
xend = cors.X[,comp1],
yend = cors.X[,comp2],
),
col = var.arrow.col.X,
arrow = arrow(length = unit(var.arrow.length, "cm")),
size = var.arrow.size,
show.legend = FALSE
)
gg_biplot
# Y variables
gg_biplot <-
gg_biplot + geom_segment(
aes(
x = 0,
y = 0,
xend = cors.Y[,comp1],
yend = cors.Y[,comp2],
),
col = var.arrow.col.Y,
arrow = arrow(length = unit(var.arrow.length, "cm")),
size = var.arrow.size,
show.legend = FALSE
)
gg_biplot
## labels of X variables
var.labels.X <- rownames(loadings.X)
gg_biplot <-
gg_biplot + geom_text_repel(
aes(
x = cors.X[, comp1],
y = cors.X[, comp2],
label = var.labels.X),
col = var.arrow.col.X)
## labels of Y variables
var.labels.Y <- rownames(loadings.Y)
gg_biplot <-
gg_biplot + geom_text_repel(
aes(
x = cors.Y[, comp1],
y = cors.Y[, comp2],
label = var.labels.Y),
col = var.arrow.col.Y)
MINT_sPLS_taxa_biplot <- gg_biplot
MINT_sPLS_taxa_biplot
And then also on functions:
# Plotting immediately here, all my objects are prepared already:
res = mint.spls(X = GOs_biplot_names,
Y = metadata_MINT_biplot,
ncomp = 2,
keepX = c(10, 10),
study = sample_data(megan_go_clr_5)$Sampling_trip)
plotIndiv(res)
plotIndiv(res,
group = res$study,
# title = 'global MINT sPLS | Microbial Function-WQ',
legend = T,
rep.space = "XY-variate",
col.per.group =c("indianred", # Sampling trip 1
"indianred4", # Sampling trip 2
"red3", # Sampling trip 3
"slateblue"), # Sampling trip 4
# legend.title = 'Sampling Trip'
) # symbol represent study
plotVar(res)
library(ggrepel)
# INPUT ARGUMENTS
col = res$study # color of samples according to the study
pch = res$study # pch of samples according to the study
var.names.col = 'grey40'
var.names.size = 4
var.arrow.col.X = 'lightblue' # color of arrow + name for X data set (could be set as a vector length the number of variables selected in X)
var.arrow.col.Y = 'orange'
var.arrow.size = 0.5
var.arrow.length = 0.2
# components to be plotted
comp1 = 1
comp2 = 2
# input the MINT res object
object <- res
comp <- object$ncomp
## --- code starts here --------
# identify variables selected
selection.X <- rowSums(object$loadings$X[, 1:comp]) != 0
selection.Y <- rowSums(object$loadings$Y[, 1:comp]) != 0
loadings.X <- data.frame(object$loadings$X[selection.X, ])
loadings.Y <- data.frame(object$loadings$Y[selection.Y, ])
# if cutoff for the correlation circle plot (not used here, if you do I think things will break!)
cutoff <- 0
cors.X <- cor(object$X[, selection.X], object$variates$X[, 1:comp], use = 'pairwise' )
cors.Y <- cor(object$Y[, selection.Y], object$variates$Y[, 1:comp], use = 'pairwise' )
above.cutoff.X <- apply(cors.X, 1, function(x) any(abs(x) >= cutoff))
above.cutoff.Y <- apply(cors.Y, 1, function(x) any(abs(x) >= cutoff))
loadings.X <- loadings.X[above.cutoff.X,]
loadings.Y <- loadings.Y[above.cutoff.Y,]
# only representing the samples in the X space
variates <- object$variates$X
variates <- data.frame(variates)
## scaler of var vs sample coordinates - added one for each data set
scaler.X <- max(object$variates$X, na.rm = TRUE)/max(abs(cors.X), na.rm = TRUE)
scaler.Y <- max(object$variates$Y, na.rm = TRUE)/max(abs(cors.Y), na.rm = TRUE)
axes.titles <- c('Comp 1', 'Comp 2')
## ------------- outline of plot -----
gg_biplot <-
ggplot() +
theme_classic() +
labs(x = axes.titles[1],
y = axes.titles[2])
# ## vline and hline - you may want to comment this if you dont want these lines
gg_biplot <- gg_biplot + geom_vline(xintercept = 0, size = 0.3, col = 'grey75')
gg_biplot <- gg_biplot + geom_hline(yintercept = 0, size = 0.3, col = 'grey75')
gg_biplot
# ------
# PLOT SAMPLES
gg_biplot <- gg_biplot +
geom_point(aes(x = variates[, comp1],
y = variates[, comp2], col = col, shape = pch),
size = 2,
show.legend = FALSE) +
scale_color_manual(values = c("indianred", # Sampling trip 1
"indianred4", # Sampling trip 2
"red3", # Sampling trip 3
"slateblue") # Sampling trip 4
)
gg_biplot
# PLOT VARIABLES
# the correlations are rescaled - need to fiddle a bit here
cors.X <- cors.X*scaler.X*0.7
cors.Y <- cors.Y*scaler.Y*0.8
## lines and arrows
# X variables
gg_biplot <-
gg_biplot + geom_segment(
aes(
x = 0,
y = 0,
xend = cors.X[,comp1],
yend = cors.X[,comp2],
),
col = var.arrow.col.X,
arrow = arrow(length = unit(var.arrow.length, "cm")),
size = var.arrow.size,
show.legend = FALSE
)
gg_biplot
# Y variables
gg_biplot <-
gg_biplot + geom_segment(
aes(
x = 0,
y = 0,
xend = cors.Y[,comp1],
yend = cors.Y[,comp2],
),
col = var.arrow.col.Y,
arrow = arrow(length = unit(var.arrow.length, "cm")),
size = var.arrow.size,
show.legend = FALSE
)
gg_biplot
## labels of X variables
var.labels.X <- rownames(loadings.X)
gg_biplot <-
gg_biplot + geom_text_repel(
aes(
x = cors.X[, comp1],
y = cors.X[, comp2],
label = var.labels.X),
col = var.arrow.col.X)
## labels of Y variables
var.labels.Y <- rownames(loadings.Y)
gg_biplot <-
gg_biplot + geom_text_repel(
aes(
x = cors.Y[, comp1],
y = cors.Y[, comp2],
label = var.labels.Y),
col = var.arrow.col.Y)
MINT_sPLS_GOs_biplot <- gg_biplot
MINT_sPLS_GOs_biplot
## Warning: ggrepel: 6 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
But let’s visualise these correlations as a heatmap:
# I want longer names, not just family and genus:
MINT_sPLS_ind_names_cim <- left_join(otu_table(megan_genus_clr) %>%
as.data.frame %>%
rownames_to_column("OTU"),
tax_table(megan_genus_clr) %>%
as.data.frame %>%
rownames_to_column("OTU")) %>%
unite(taxonomy, c(Class, Order, Family, Genus, Species), sep = "; ") # Adding Taxonomy info
## Joining, by = "OTU"
MINT_sPLS_ind_names_cim <- as.character(MINT_sPLS_ind_names_cim$taxonomy)
# I am just making this object (cim_mint.spls2.WQ.taxa.OTUs) to merge with the stability scores based on OTU IDs
cim_mint.spls2.WQ.taxa.OTUs <- cim(mint.spls2.WQ.taxa,
comp = 1:2,
xlab = "WQ parameters",
ylab = "Indicator microbial taxa",
margins = c(19, # bottom
28), # right
# row.names = MINT_sPLS_ind_names_cim,
symkey = FALSE,
keysize = c(1, 0.4),
title = "MINT sPLS Taxa/WQ (PCs 1 and 2)")
MINT sPLS - Taxa/WQ. Microbial taxa that were identified as indicators are shown on the y axis, while WQ measurements are shown on the x axis. These molecular signatures are shared across the four sampling transects. The scale shows similarity values (partial correlations) between the X and Y variables selected across the first two MINT sPLS dimensions, and clustered with a complete Euclidean distance method. The color indicates either positive (red) or negative (blue) correlation.
# I want longer names, not just family and genus:
MINT_sPLS_GOs_ind_names_cim <- left_join(otu_table(megan_go_clr_5) %>%
as.data.frame %>%
rownames_to_column("OTU"),
tax_table(megan_go_clr_5) %>%
as.data.frame %>%
rownames_to_column("OTU")) %>%
unite(Functions, c(Rank3, Rank4, Rank5, Rank6), sep = "; ") # Adding Taxonomy info
## Joining, by = "OTU"
MINT_sPLS_GOs_ind_names_cim <- as.character(MINT_sPLS_GOs_ind_names_cim$Functions)
# I am just making this object (cim_mint.spls2.WQ.GOs) to merge with the stability scores based on OTU IDs
cim_mint.spls2.WQ.GOs <- cim(mint.spls2.WQ.GOs,
comp = 1:2,
xlab = "WQ parameters",
ylab = "Indicator microbial GO terms (genes/functions)",
margins = c(19, # bottom
50), # right
# row.names = MINT_sPLS_GOs_ind_names_cim,
symkey = FALSE,
keysize = c(1, 0.4),
title = "MINT sPLS GOs/WQ (PCs 1 and 2)")
MINT sPLS - GOs/WQ. Microbial GO terms (genes and functions) that were identified as indicators are shown on the y axis, while WQ measurements are shown on the x axis. These molecular signatures are shared across the four sampling transects. The scale shows similarity values (partial correlations) between the X and Y variables selected across the first two sPLS dimensions, and clustered with a complete Euclidean distance method. The color indicates either positive (red) or negative (blue) correlation.
Bellow is an example code from Kim-Anh to compute stability scores of indicators taxa/genes (selected on MINT sPLS dimension 1) across sampling trips.
library(mixOmics)
data(stemcells)
# 4 studies
summary(stemcells$study)
## STABILITY analysis, just learn on 3 data sets at a time, leave one out study
#e.g. here removing study k
list.selected = NULL # initialise, then we will store the selected genes at each iteration
for(k in 1:4){ # each run: remove study k
train.studies = which(stemcells$study != k)
X = stemcells$gene[train.studies,]
Y = stemcells$celltype[train.studies]
studies = droplevels(stemcells$study[train.studies])
# do a few checks (here this is not extensive!)
summary(Y)
summary(studies)
res.train = mint.splsda(X = X, Y = Y, ncomp = 2, study = studies, keepX = c(50,50))
# append selected genes
list.selected = c(list.selected, selectVar(res.train)$name, comp = 1)
}
length(list.selected) # ok, we should have 50 genes selected on comp 1 * 4 runs = 200
table(list.selected)/4 # where 4 is the number of runs / studies we have done
sort(table(list.selected)/4, decreasing = TRUE)
We applied this on indicator taxa:
# 4 trips
summary(mint.spls2.WQ.taxa$study)
## Trip_01_Nov-Dec_2019 Trip_02_January_2020 Trip_03_February_2020
## 44 48 43
## Trip_04_July_2020
## 56
IMOS_studies <- c("Trip_01_Nov-Dec_2019",
"Trip_02_January_2020",
"Trip_03_February_2020",
"Trip_04_July_2020")
## STABILITY analysis, just learn on 3 data sets at a time, leave one out study
#e.g. here removing study k
list.selected = NULL # initialise, then we will store the selected genes at each iteration
for(k in IMOS_studies){ # each run: remove study k
train.studies = which(mint.spls2.WQ.taxa$study != k)
X = OTUs_biplot_names[train.studies,]
Y = metadata_MINT_biplot[train.studies,]
IMOS.studies = droplevels(mint.spls2.WQ.taxa$study[train.studies])
# do a few checks (here this is not extensive!)
summary(Y)
summary(IMOS.studies)
res.train = mint.spls(X = X, Y = Y, ncomp = 2, study = IMOS.studies, keepX = c(50,50))
# append selected genes
list.selected = c(list.selected, selectVar(res.train)$X$name, comp = 1)
}
# Saving this as a separate object for taxa
list.selected.taxa <- list.selected
length(list.selected.taxa) # ok, we should have 50 genes selected on comp 1 * 4 runs = 200
## [1] 227
table(list.selected.taxa)/4 # where 4 is the number of runs / studies we have done
## list.selected.taxa
## 1
## 1.00
## 1046_Chromatiaceae; Unknown Genus
## 0.25
## 1068904_Roseobacteraceae; Primorskyibacter
## 0.25
## 1080193_Flavobacteriaceae; Hyunsoonleella
## 0.25
## 112040_Flavobacteriaceae; Zobellia
## 0.25
## 1123951_Phyllobacteriaceae; Thalassocella
## 0.25
## 1150_Unknown Family; Unknown Genus
## 0.75
## 1161_Unknown Family; Unknown Genus
## 0.25
## 1172191_Alteromonadaceae; Catenovulum
## 0.25
## 118_Planctomycetaceae; Planctomyces
## 0.25
## 118968_Coxiellaceae; Unknown Genus
## 0.25
## 119045_Methylobacteriaceae; Unknown Genus
## 0.25
## 119060_Burkholderiaceae; Unknown Genus
## 0.50
## 1195766_Rhodobacteraceae; Planktotalea
## 0.25
## 1211036_Flavobacteriaceae; Mangrovimonas
## 0.25
## 1220535_Unknown Family; Unknown Genus
## 0.25
## 1224_Unknown Family; Unknown Genus
## 0.75
## 1236_Unknown Family; Unknown Genus
## 0.25
## 1246884_Robiginitomaculaceae; Algimonas
## 0.25
## 125216_Acetobacteraceae; Roseomonas
## 0.25
## 125287_Ornithinimicrobiaceae; Ornithinimicrobium
## 0.25
## 1263978_Rhodospirillaceae; Candidatus Endolissoclinum
## 0.50
## 12916_Comamonadaceae; Acidovorax
## 0.25
## 1331809_Kordiimonadaceae; Unknown Genus
## 0.25
## 1341118_Halieaceae; Luminiphilus
## 0.75
## 135613_Unknown Family; Unknown Genus
## 0.25
## 135617_Thiotrichaceae; Unknown Genus
## 0.25
## 135619_Unknown Family; Unknown Genus
## 0.75
## 135622_Unknown Family; Unknown Genus
## 0.75
## 1389453_Candidatus Actinomarinaceae; Candidatus Actinomarina
## 0.25
## 1400386_Lacipirellulaceae; Bythopirellula
## 0.25
## 1406885_Alteromonadaceae; Aliiglaciecola
## 0.50
## 1434034_Flavobacteriaceae; Pricia
## 0.25
## 1443919_Rhodobacteraceae; Tabrizicola
## 0.25
## 1458928_Oscillatoriaceae; Okeania
## 0.25
## 146_Spirochaetaceae; Spirochaeta
## 0.25
## 1471398_Prolixibacteraceae; Unknown Genus
## 0.25
## 1484898_Hyphomicrobiaceae; Methyloceanibacter
## 0.25
## 149698_Oxalobacteraceae; Massilia
## 0.25
## 1501348_Amoebophilaceae; Unknown Genus
## 0.25
## 150830_Stappiaceae; Roseibium
## 0.25
## 152180_Ahrensiaceae; Ahrensia
## 0.25
## 1524249_Unknown Family; Pseudohongiella
## 0.25
## 1553903_Oligoflexaceae; Oligoflexus
## 0.25
## 1564515_Haliscomenobacteraceae; Phaeodactylibacter
## 0.25
## 159345_Roseobacteraceae; Roseibacterium
## 0.25
## 1608457_Rhodobacteraceae; Aestuariivita
## 0.25
## 1617805_Rhodobacteraceae; Amylibacter
## 0.25
## 165697_Sphingomonadaceae; Sphingopyxis
## 0.50
## 167375_Prochlorococcaceae; Cyanobium
## 0.25
## 1676142_Wenzhouxiangellaceae; Wenzhouxiangella
## 0.25
## 1680826_Candidatus Thalassarchaeaceae; Candidatus Thalassarchaeum
## 0.75
## 1706369_Unknown Family; Unknown Genus
## 0.25
## 171436_Rhodospirillaceae; Tistrella
## 0.25
## 171552_Prevotellaceae; Unknown Genus
## 0.25
## 1716_Corynebacteriaceae; Corynebacterium
## 0.25
## 1752734_Unknown Family; Unknown Genus
## 0.25
## 1760_Unknown Family; Unknown Genus
## 0.25
## 1775411_Rhodanobacteraceae; Unknown Genus
## 0.25
## 1792291_Cellvibrionaceae; Marinagarivorans
## 0.25
## 1803399_Unknown Family; Candidatus Peribacter
## 0.50
## 1804663_Rhodospirillaceae; Haematospirillum
## 0.25
## 1813606_Balneolaceae; Unknown Genus
## 0.25
## 1822464_Burkholderiaceae; Paraburkholderia
## 0.25
## 182709_Bacillaceae; Oceanobacillus
## 0.25
## 183963_Unknown Family; Unknown Genus
## 0.25
## 1847_Pseudonocardiaceae; Pseudonocardia
## 0.25
## 1853232_Hymenobacteraceae; Unknown Genus
## 0.25
## 186650_Methylobacteriaceae; Microvirga
## 0.25
## 186801_Unknown Family; Unknown Genus
## 0.25
## 186802_Unknown Family; Unknown Genus
## 0.75
## 186822_Paenibacillaceae; Unknown Genus
## 0.25
## 1869227_Unknown Family; Unknown Genus
## 0.25
## 1890424_Unknown Family; Unknown Genus
## 0.75
## 1890426_Synechococcaceae; Unknown Genus
## 0.25
## 1915401_Phyllobacteriaceae; Roseitalea
## 0.25
## 191767_Nannocystaceae; Plesiocystis
## 0.25
## 1931200_Rhodobacteraceae; Marinibacterium
## 0.75
## 194_Campylobacteraceae; Campylobacter
## 0.25
## 194924_Desulfovibrionaceae; Unknown Genus
## 0.25
## 1960290_Sphingosinicellaceae; Pacificimonas
## 0.25
## 2_Unknown Family; Unknown Genus
## 0.75
## 200644_Unknown Family; Unknown Genus
## 0.25
## 202746_Thiovulaceae; Sulfurimonas
## 0.25
## 204428_Unknown Family; Unknown Genus
## 0.25
## 204455_Unknown Family; Unknown Genus
## 0.75
## 204456_Rhodobacteraceae; Gemmobacter
## 0.25
## 213421_Desulfuromonadaceae; Unknown Genus
## 0.25
## 213422_Geobacteraceae; Unknown Genus
## 0.25
## 2146_Acholeplasmataceae; Unknown Genus
## 0.50
## 2299_Desulfobacteraceae; Desulfosarcina
## 0.25
## 2383_Lachnospiraceae; Epulopiscium
## 0.25
## 244698_Flavobacteriaceae; Gillisia
## 0.25
## 245186_Roseobacteraceae; Loktanella
## 0.25
## 246873_Crocinitomicaceae; Crocinitomix
## 0.25
## 252356_Flavobacteriaceae; Maribacter
## 0.25
## 258255_Stappiaceae; Pseudovibrio
## 0.25
## 265488_Pirellulaceae; Rhodopirellula
## 0.25
## 265976_Ornithinimicrobiaceae; Serinicoccus
## 0.25
## 270_Thermaceae; Thermus
## 0.25
## 274591_Phyllobacteriaceae; Hoeflea
## 0.25
## 28105_Rhizobiaceae; Sinorhizobium
## 0.25
## 28211_Unknown Family; Unknown Genus
## 0.25
## 28216_Unknown Family; Unknown Genus
## 0.50
## 28221_Unknown Family; Unknown Genus
## 0.25
## 28222_Desulfobacteraceae; Desulfobacula
## 0.25
## 282682_Roseobacteraceae; Citreicella
## 0.25
## 28453_Sphingobacteriaceae; Sphingobacterium
## 0.25
## 286_Pseudomonadaceae; Pseudomonas
## 0.25
## 288021_Kordiimonadaceae; Kordiimonas
## 0.25
## 291183_Flavobacteriaceae; Lacinutrix
## 0.25
## 315422_Roseobacteraceae; Palleronia
## 0.25
## 316625_Cellvibrionaceae; Saccharophagus
## 0.25
## 31957_Propionibacteriaceae; Unknown Genus
## 0.25
## 31969_Unknown Family; Unknown Genus
## 0.50
## 31989_Rhodobacteraceae; Unknown Genus
## 0.75
## 32033_Xanthomonadaceae; Unknown Genus
## 0.25
## 335927_Roseobacteraceae; Thalassobius
## 0.25
## 335928_Xanthobacteraceae; Unknown Genus
## 0.25
## 336276_Flavobacteriaceae; Olleya
## 0.25
## 356_Unknown Family; Unknown Genus
## 0.75
## 357_Rhizobiaceae; Agrobacterium
## 0.25
## 366580_Alteromonadaceae; Bowmanella
## 0.25
## 367771_Roseobacteraceae; Marinovum
## 0.75
## 379068_Flavobacteriaceae; Galbibacter
## 0.25
## 379070_Flavobacteriaceae; Gilvibacter
## 0.25
## 404235_Roseobacteraceae; Maritimibacter
## 0.25
## 404432_Halomonadaceae; Salinicola
## 0.25
## 41275_Caulobacteraceae; Brevundimonas
## 0.25
## 41294_Bradyrhizobiaceae; Unknown Genus
## 0.25
## 41295_Rhodospirillaceae; Unknown Genus
## 0.75
## 417127_Flavobacteriaceae; Zunongwangia
## 0.25
## 42054_Halomonadaceae; Chromohalobacter
## 0.25
## 436357_Roseobacteraceae; Thalassococcus
## 0.25
## 437504_Granulosicoccaceae; Granulosicoccus
## 0.25
## 437506_Robiginitomaculaceae; Robiginitomaculum
## 0.25
## 45404_Beijerinckiaceae; Unknown Genus
## 0.25
## 455358_Balneolaceae; Balneola
## 0.75
## 468938_Puniceicoccaceae; Cerasicoccus
## 0.25
## 478070_Stappiaceae; Labrenzia
## 0.50
## 482_Neisseriaceae; Neisseria
## 0.25
## 49279_Flavobacteriaceae; Gelidibacter
## 0.25
## 51291_Unknown Family; Unknown Genus
## 0.25
## 543_Enterobacteriaceae; Unknown Genus
## 0.75
## 544448_Unknown Family; Unknown Genus
## 0.50
## 561_Enterobacteriaceae; Escherichia
## 0.50
## 568386_Sinobacteraceae; Unknown Genus
## 0.25
## 570_Enterobacteriaceae; Klebsiella
## 0.50
## 574899_Verrucomicrobiaceae; Haloferula
## 0.75
## 62680_Unknown Family; Unknown Genus
## 0.25
## 649462_Balneolaceae; Gracilimonas
## 0.25
## 655184_Unknown Family; Candidatus Thioglobus
## 0.25
## 655352_Cohaesibacteraceae; Cohaesibacter
## 0.25
## 65842_Unknown Family; Unknown Genus
## 0.25
## 72276_Ectothiorhodospiraceae; Unknown Genus
## 0.25
## 75_Caulobacteraceae; Caulobacter
## 0.25
## 759360_Oceanospirillaceae; Oleibacter
## 0.25
## 762641_Flavobacteriaceae; Muriicola
## 0.25
## 76831_Flavobacteriaceae; Myroides
## 0.25
## 80864_Comamonadaceae; Unknown Genus
## 0.25
## 80865_Comamonadaceae; Delftia
## 1.00
## 81_Hyphomicrobiaceae; Hyphomicrobium
## 0.25
## 82115_Rhizobiaceae; Unknown Genus
## 0.25
## 85413_Boseaceae; Bosea
## 0.25
## 866673_Marinifilaceae; Marinifilum
## 0.25
## 904708_Arenicellaceae; Arenicella
## 0.25
## 907197_Pseudoalteromonadaceae; Psychrosphaera
## 0.25
## 91347_Unknown Family; Unknown Genus
## 0.25
## 914_Nitrosomonadaceae; Nitrosomonas
## 0.25
## 976_Unknown Family; Unknown Genus
## 0.75
## 986106_Acidiferrobacteraceae; Acidiferrobacter
## 0.25
sort(table(list.selected.taxa)/4, decreasing = TRUE)
## list.selected.taxa
## 1
## 1.00
## 80865_Comamonadaceae; Delftia
## 1.00
## 1150_Unknown Family; Unknown Genus
## 0.75
## 1224_Unknown Family; Unknown Genus
## 0.75
## 1341118_Halieaceae; Luminiphilus
## 0.75
## 135619_Unknown Family; Unknown Genus
## 0.75
## 135622_Unknown Family; Unknown Genus
## 0.75
## 1680826_Candidatus Thalassarchaeaceae; Candidatus Thalassarchaeum
## 0.75
## 186802_Unknown Family; Unknown Genus
## 0.75
## 1890424_Unknown Family; Unknown Genus
## 0.75
## 1931200_Rhodobacteraceae; Marinibacterium
## 0.75
## 2_Unknown Family; Unknown Genus
## 0.75
## 204455_Unknown Family; Unknown Genus
## 0.75
## 31989_Rhodobacteraceae; Unknown Genus
## 0.75
## 356_Unknown Family; Unknown Genus
## 0.75
## 367771_Roseobacteraceae; Marinovum
## 0.75
## 41295_Rhodospirillaceae; Unknown Genus
## 0.75
## 455358_Balneolaceae; Balneola
## 0.75
## 543_Enterobacteriaceae; Unknown Genus
## 0.75
## 574899_Verrucomicrobiaceae; Haloferula
## 0.75
## 976_Unknown Family; Unknown Genus
## 0.75
## 119060_Burkholderiaceae; Unknown Genus
## 0.50
## 1263978_Rhodospirillaceae; Candidatus Endolissoclinum
## 0.50
## 1406885_Alteromonadaceae; Aliiglaciecola
## 0.50
## 165697_Sphingomonadaceae; Sphingopyxis
## 0.50
## 1803399_Unknown Family; Candidatus Peribacter
## 0.50
## 2146_Acholeplasmataceae; Unknown Genus
## 0.50
## 28216_Unknown Family; Unknown Genus
## 0.50
## 31969_Unknown Family; Unknown Genus
## 0.50
## 478070_Stappiaceae; Labrenzia
## 0.50
## 544448_Unknown Family; Unknown Genus
## 0.50
## 561_Enterobacteriaceae; Escherichia
## 0.50
## 570_Enterobacteriaceae; Klebsiella
## 0.50
## 1046_Chromatiaceae; Unknown Genus
## 0.25
## 1068904_Roseobacteraceae; Primorskyibacter
## 0.25
## 1080193_Flavobacteriaceae; Hyunsoonleella
## 0.25
## 112040_Flavobacteriaceae; Zobellia
## 0.25
## 1123951_Phyllobacteriaceae; Thalassocella
## 0.25
## 1161_Unknown Family; Unknown Genus
## 0.25
## 1172191_Alteromonadaceae; Catenovulum
## 0.25
## 118_Planctomycetaceae; Planctomyces
## 0.25
## 118968_Coxiellaceae; Unknown Genus
## 0.25
## 119045_Methylobacteriaceae; Unknown Genus
## 0.25
## 1195766_Rhodobacteraceae; Planktotalea
## 0.25
## 1211036_Flavobacteriaceae; Mangrovimonas
## 0.25
## 1220535_Unknown Family; Unknown Genus
## 0.25
## 1236_Unknown Family; Unknown Genus
## 0.25
## 1246884_Robiginitomaculaceae; Algimonas
## 0.25
## 125216_Acetobacteraceae; Roseomonas
## 0.25
## 125287_Ornithinimicrobiaceae; Ornithinimicrobium
## 0.25
## 12916_Comamonadaceae; Acidovorax
## 0.25
## 1331809_Kordiimonadaceae; Unknown Genus
## 0.25
## 135613_Unknown Family; Unknown Genus
## 0.25
## 135617_Thiotrichaceae; Unknown Genus
## 0.25
## 1389453_Candidatus Actinomarinaceae; Candidatus Actinomarina
## 0.25
## 1400386_Lacipirellulaceae; Bythopirellula
## 0.25
## 1434034_Flavobacteriaceae; Pricia
## 0.25
## 1443919_Rhodobacteraceae; Tabrizicola
## 0.25
## 1458928_Oscillatoriaceae; Okeania
## 0.25
## 146_Spirochaetaceae; Spirochaeta
## 0.25
## 1471398_Prolixibacteraceae; Unknown Genus
## 0.25
## 1484898_Hyphomicrobiaceae; Methyloceanibacter
## 0.25
## 149698_Oxalobacteraceae; Massilia
## 0.25
## 1501348_Amoebophilaceae; Unknown Genus
## 0.25
## 150830_Stappiaceae; Roseibium
## 0.25
## 152180_Ahrensiaceae; Ahrensia
## 0.25
## 1524249_Unknown Family; Pseudohongiella
## 0.25
## 1553903_Oligoflexaceae; Oligoflexus
## 0.25
## 1564515_Haliscomenobacteraceae; Phaeodactylibacter
## 0.25
## 159345_Roseobacteraceae; Roseibacterium
## 0.25
## 1608457_Rhodobacteraceae; Aestuariivita
## 0.25
## 1617805_Rhodobacteraceae; Amylibacter
## 0.25
## 167375_Prochlorococcaceae; Cyanobium
## 0.25
## 1676142_Wenzhouxiangellaceae; Wenzhouxiangella
## 0.25
## 1706369_Unknown Family; Unknown Genus
## 0.25
## 171436_Rhodospirillaceae; Tistrella
## 0.25
## 171552_Prevotellaceae; Unknown Genus
## 0.25
## 1716_Corynebacteriaceae; Corynebacterium
## 0.25
## 1752734_Unknown Family; Unknown Genus
## 0.25
## 1760_Unknown Family; Unknown Genus
## 0.25
## 1775411_Rhodanobacteraceae; Unknown Genus
## 0.25
## 1792291_Cellvibrionaceae; Marinagarivorans
## 0.25
## 1804663_Rhodospirillaceae; Haematospirillum
## 0.25
## 1813606_Balneolaceae; Unknown Genus
## 0.25
## 1822464_Burkholderiaceae; Paraburkholderia
## 0.25
## 182709_Bacillaceae; Oceanobacillus
## 0.25
## 183963_Unknown Family; Unknown Genus
## 0.25
## 1847_Pseudonocardiaceae; Pseudonocardia
## 0.25
## 1853232_Hymenobacteraceae; Unknown Genus
## 0.25
## 186650_Methylobacteriaceae; Microvirga
## 0.25
## 186801_Unknown Family; Unknown Genus
## 0.25
## 186822_Paenibacillaceae; Unknown Genus
## 0.25
## 1869227_Unknown Family; Unknown Genus
## 0.25
## 1890426_Synechococcaceae; Unknown Genus
## 0.25
## 1915401_Phyllobacteriaceae; Roseitalea
## 0.25
## 191767_Nannocystaceae; Plesiocystis
## 0.25
## 194_Campylobacteraceae; Campylobacter
## 0.25
## 194924_Desulfovibrionaceae; Unknown Genus
## 0.25
## 1960290_Sphingosinicellaceae; Pacificimonas
## 0.25
## 200644_Unknown Family; Unknown Genus
## 0.25
## 202746_Thiovulaceae; Sulfurimonas
## 0.25
## 204428_Unknown Family; Unknown Genus
## 0.25
## 204456_Rhodobacteraceae; Gemmobacter
## 0.25
## 213421_Desulfuromonadaceae; Unknown Genus
## 0.25
## 213422_Geobacteraceae; Unknown Genus
## 0.25
## 2299_Desulfobacteraceae; Desulfosarcina
## 0.25
## 2383_Lachnospiraceae; Epulopiscium
## 0.25
## 244698_Flavobacteriaceae; Gillisia
## 0.25
## 245186_Roseobacteraceae; Loktanella
## 0.25
## 246873_Crocinitomicaceae; Crocinitomix
## 0.25
## 252356_Flavobacteriaceae; Maribacter
## 0.25
## 258255_Stappiaceae; Pseudovibrio
## 0.25
## 265488_Pirellulaceae; Rhodopirellula
## 0.25
## 265976_Ornithinimicrobiaceae; Serinicoccus
## 0.25
## 270_Thermaceae; Thermus
## 0.25
## 274591_Phyllobacteriaceae; Hoeflea
## 0.25
## 28105_Rhizobiaceae; Sinorhizobium
## 0.25
## 28211_Unknown Family; Unknown Genus
## 0.25
## 28221_Unknown Family; Unknown Genus
## 0.25
## 28222_Desulfobacteraceae; Desulfobacula
## 0.25
## 282682_Roseobacteraceae; Citreicella
## 0.25
## 28453_Sphingobacteriaceae; Sphingobacterium
## 0.25
## 286_Pseudomonadaceae; Pseudomonas
## 0.25
## 288021_Kordiimonadaceae; Kordiimonas
## 0.25
## 291183_Flavobacteriaceae; Lacinutrix
## 0.25
## 315422_Roseobacteraceae; Palleronia
## 0.25
## 316625_Cellvibrionaceae; Saccharophagus
## 0.25
## 31957_Propionibacteriaceae; Unknown Genus
## 0.25
## 32033_Xanthomonadaceae; Unknown Genus
## 0.25
## 335927_Roseobacteraceae; Thalassobius
## 0.25
## 335928_Xanthobacteraceae; Unknown Genus
## 0.25
## 336276_Flavobacteriaceae; Olleya
## 0.25
## 357_Rhizobiaceae; Agrobacterium
## 0.25
## 366580_Alteromonadaceae; Bowmanella
## 0.25
## 379068_Flavobacteriaceae; Galbibacter
## 0.25
## 379070_Flavobacteriaceae; Gilvibacter
## 0.25
## 404235_Roseobacteraceae; Maritimibacter
## 0.25
## 404432_Halomonadaceae; Salinicola
## 0.25
## 41275_Caulobacteraceae; Brevundimonas
## 0.25
## 41294_Bradyrhizobiaceae; Unknown Genus
## 0.25
## 417127_Flavobacteriaceae; Zunongwangia
## 0.25
## 42054_Halomonadaceae; Chromohalobacter
## 0.25
## 436357_Roseobacteraceae; Thalassococcus
## 0.25
## 437504_Granulosicoccaceae; Granulosicoccus
## 0.25
## 437506_Robiginitomaculaceae; Robiginitomaculum
## 0.25
## 45404_Beijerinckiaceae; Unknown Genus
## 0.25
## 468938_Puniceicoccaceae; Cerasicoccus
## 0.25
## 482_Neisseriaceae; Neisseria
## 0.25
## 49279_Flavobacteriaceae; Gelidibacter
## 0.25
## 51291_Unknown Family; Unknown Genus
## 0.25
## 568386_Sinobacteraceae; Unknown Genus
## 0.25
## 62680_Unknown Family; Unknown Genus
## 0.25
## 649462_Balneolaceae; Gracilimonas
## 0.25
## 655184_Unknown Family; Candidatus Thioglobus
## 0.25
## 655352_Cohaesibacteraceae; Cohaesibacter
## 0.25
## 65842_Unknown Family; Unknown Genus
## 0.25
## 72276_Ectothiorhodospiraceae; Unknown Genus
## 0.25
## 75_Caulobacteraceae; Caulobacter
## 0.25
## 759360_Oceanospirillaceae; Oleibacter
## 0.25
## 762641_Flavobacteriaceae; Muriicola
## 0.25
## 76831_Flavobacteriaceae; Myroides
## 0.25
## 80864_Comamonadaceae; Unknown Genus
## 0.25
## 81_Hyphomicrobiaceae; Hyphomicrobium
## 0.25
## 82115_Rhizobiaceae; Unknown Genus
## 0.25
## 85413_Boseaceae; Bosea
## 0.25
## 866673_Marinifilaceae; Marinifilum
## 0.25
## 904708_Arenicellaceae; Arenicella
## 0.25
## 907197_Pseudoalteromonadaceae; Psychrosphaera
## 0.25
## 91347_Unknown Family; Unknown Genus
## 0.25
## 914_Nitrosomonadaceae; Nitrosomonas
## 0.25
## 986106_Acidiferrobacteraceae; Acidiferrobacter
## 0.25
Plotting MINT sPLS stability scores together with the heatmap, for taxa:
# Row names in this object (mint.spls2.WQ.taxa.mat.cor) is how indicator taxa are ordered in the heatmap, Use this when setting the level in the aes() of ggplot2
# This is the table with Stability scores on dimension 1
MINT_sPLS_dim1_stability <- as.data.frame(table(list.selected.taxa)/4) %>%
separate(col = "list.selected.taxa", # I am splitting this column
sep = "_", # This is the separator
into = c("OTU", "taxa")
)
# Removing the taxa column - not needed:
MINT_sPLS_dim1_stability$taxa <- NULL
# getting names for taxa
OTUs_biplot_colnames <- left_join(otu_table(megan_genus_clr) %>%
as.data.frame %>%
rownames_to_column("OTU"),
tax_table(megan_genus_clr) %>%
as.data.frame %>%
rownames_to_column("OTU")) %>%
unite(taxonomy, c(Family, Genus), sep = "; ") # Adding Taxonomy info
## Joining, by = "OTU"
OTUs_biplot_colnames <- OTUs_biplot_colnames %>%
dplyr::select("OTU", "taxonomy")
# Merging:
MINT_sPLS_dim1_stability <- left_join(MINT_sPLS_dim1_stability,
OTUs_biplot_colnames)
# Re-running MINT - I need OTUs only as names (not OTUs and taxonomy)
MINT_sPLS_similarity_scores_and_LOGOCV_taxa <- mint.spls(X = OTUs_biplot,
Y = sample_data(megan_genus_clr)[,24:40],
ncomp = 2,
study = sample_data(megan_genus_clr)$Sampling_trip,
keepX = keepX, # 50 taxa on dims 1 and 2
mode = "regression")
# I am just making this object (cim_mint.spls2.WQ.taxa.OTUs) to merge with the stability scores based on OTU IDs
MINT_sPLS_similarity_scores_and_LOGOCV_taxa <- cim(MINT_sPLS_similarity_scores_and_LOGOCV_taxa,
comp = 1:2,
xlab = "WQ parameters",
ylab = "Indicator microbial taxa",
margins = c(19, # bottom
27), # right
# row.names = MINT_sPLS_ind_names_cim,
symkey = FALSE,
keysize = c(1, 0.4),
title = "MINT sPLS Taxa/WQ (PCs 1 and 2)")
# Extracting the correlation matrix from the MINT sPLS heatmap
mint.spls2.WQ.taxa.mat.cor <- MINT_sPLS_similarity_scores_and_LOGOCV_taxa$mat.cor
# Merging them!
MINT_sPLS_dim1_stability_merged <- left_join(as.data.frame(mint.spls2.WQ.taxa.mat.cor) %>% rownames_to_column("OTU"),
MINT_sPLS_dim1_stability) # %>%
# filter(if_any(everything(), ~ !is.na(Freq))) # removing those that have NAs as stability scores
# Barplots
MINT_sPLS_stability_plots_ordered.dim1 <- MINT_sPLS_dim1_stability_merged[,c(1, 19, 20)] %>% # Selecting OTU (1), Freq (19), and taxonomy (20)
ggplot(aes(y = factor(OTU, level = unique(row.names(as.data.frame(mint.spls2.WQ.taxa.mat.cor)))),
x = Freq,
fill =Freq > 0.25)) + # from Largest to smallest Stability
geom_bar(stat = "identity") +
scale_fill_manual(values = c("FALSE" = "grey40", "TRUE" = "seagreen3"), guide = FALSE) +
# scale_y_discrete(limits=rev) + # Reversing the order to match the heatmap
labs(y = 'MINT sPLS Indicator Microbes',
x = "LOGOCV Stability score - dim 1",
title = 'MINT sPLS Are these signals shared across trips?',
subtitle = 'Leave One Group Out Cross Validation (LOGOCV)') +
theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 12), legend.position = "NONE")
MINT_sPLS_stability_plots_ordered.dim1
# Now exporting this for RawGraphs, this can go in Supplementary Material
RawGraphs_shared_taxa_MINT <- MINT_sPLS_stability_plots_ordered.dim1$data %>%
as.data.frame() %>%
left_join(.,
tax_table(megan_genus_clr) %>%
as.data.frame %>%
rownames_to_column("OTU")) %>%
# unite(full_taxonomy, c(Rank1, Rank2, Rank3, Rank4, Rank5, Rank6, Rank7), sep = "; ") %>% # Adding Taxonomy info
dplyr::filter(if_any(everything(), ~ !is.na(Freq)))
# Exporting as csv
write.csv(RawGraphs_shared_taxa_MINT, file = "/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Paper_drafts/the_analysis_is_finalised/Code/output_tables/RawGraphs_shared_taxa_MINT.csv", quote = F, row.names = F)
and indicator GO terms:
# 4 trips
summary(mint.spls2.WQ.GOs$study)
## Trip_01_Nov-Dec_2019 Trip_02_January_2020 Trip_03_February_2020
## 44 48 43
## Trip_04_July_2020
## 56
IMOS_studies <- c("Trip_01_Nov-Dec_2019",
"Trip_02_January_2020",
"Trip_03_February_2020",
"Trip_04_July_2020")
## STABILITY analysis, just learn on 3 data sets at a time, leave one out study
#e.g. here removing study k
list.selected = NULL # initialise, then we will store the selected genes at each iteration
for(k in IMOS_studies){ # each run: remove study k
train.studies = which(mint.spls2.WQ.GOs$study != k)
X = GOs_biplot_names[train.studies,]
Y = metadata_MINT_biplot[train.studies,]
IMOS.studies = droplevels(mint.spls2.WQ.GOs$study[train.studies])
# do a few checks (here this is not extensive!)
summary(Y)
summary(IMOS.studies)
res.train = mint.spls(X = X, Y = Y, ncomp = 2, study = IMOS.studies, keepX = c(50,50))
# append selected genes
list.selected = c(list.selected, selectVar(res.train)$X$name, comp = 1)
}
# Saving this as a separate object for taxa
list.selected.GOs <- list.selected
length(list.selected.GOs) # ok, we should have 50 genes selected on comp 1 * 4 runs = 200
## [1] 204
table(list.selected.GOs)/4 # where 4 is the number of runs / studies we have done
## list.selected.GOs
## 1
## 1.00
## 10074_GO:0009058 biosynthetic process
## 0.25
## 10108_GO:0009058 biosynthetic process
## 0.25
## 1015_GO:0009058 biosynthetic process
## 0.25
## 10226_IPR010226 NADH-quinone oxidoreductase, chain I
## 1.00
## 10228_GO:0016491 oxidoreductase activity
## 0.25
## 1036_IPR001036 Acriflavin resistance protein
## 1.00
## 10404_GO:0006807 nitrogen compound metabolic process
## 0.25
## 1062_IPR001062 Transcription antitermination protein, NusG
## 0.25
## 1063_GO:0006412 translation
## 0.25
## 11284_IPR011284 3-oxoacyl-(acyl-carrier-protein) reductase
## 0.25
## 115_GO:0009058 biosynthetic process
## 0.25
## 11537_IPR011537 NADH ubiquinone oxidoreductase, F subunit
## 0.50
## 11701_IPR011701 Major facilitator superfamily
## 0.25
## 11806_IPR011806 Sulphite reductase, dissimilatory-type alpha subunit
## 0.25
## 1182_IPR001182 Probable peptidoglycan glycosyltransferase FtsW/RodA
## 0.25
## 11864_IPR011864 Phosphate ABC transporter, permease protein PstC
## 0.50
## 11890_IPR024704 Structural maintenance of chromosomes protein
## 0.25
## 119_IPR000119 Histone-like DNA-binding protein
## 0.25
## 11900_IPR011900 Glutaredoxin, GrxC
## 0.25
## 1209_GO:0006412 translation
## 0.25
## 12098_IPR012098 SRP-independent targeting protein 3
## 0.25
## 12099_GO:0065003 protein-containing complex assembly
## 0.25
## 12147_GO:0016740 transferase activity
## 0.25
## 12245_GO:0009058 biosynthetic process
## 0.50
## 13025_GO:0006412 translation
## 0.50
## 131_GO:0009058 biosynthetic process
## 0.25
## 13765_IPR013765 DNA recombination and repair protein RecA
## 0.25
## 13954_NA
## 0.25
## 14105_GO:0009058 biosynthetic process
## 0.25
## 14358_IPR014358 Enoyl-[acyl-carrier-protein] reductase (NADH)
## 1.00
## 1441_GO:0016740 transferase activity
## 0.25
## 14434_IPR014434 Monothiol glutaredoxin
## 0.25
## 15_IPR000015 Outer membrane usher protein
## 0.25
## 15815_GO:0016491 oxidoreductase activity
## 0.50
## 1591_IPR001591 Influenza RNA-dependent RNA polymerase subunit PB2
## 1.00
## 16299_GO:0009058 biosynthetic process
## 0.25
## 16484_IPR016484 GTP-binding protein EngA
## 0.50
## 16932_IPR016932 Uncharacterised conserved protein UCP029669
## 0.25
## 17244_IPR017244 Ribosomal RNA large subunit methyltransferase K/L
## 0.50
## 17649_GO:0006807 nitrogen compound metabolic process
## 0.25
## 17666_IPR017666 2-aminoethylphosphonate ABC transport system, ATP-binding component PhnT2
## 0.25
## 17847_IPR017847 Type VI secretion system, RhsGE-associated Vgr family subset
## 0.25
## 19007_IPR019007 WW domain binding protein 11
## 0.25
## 19407_IPR019407 Cytoplasmic tRNA 2-thiolation protein 2
## 0.25
## 1951_IPR001951 Histone H4
## 0.25
## 1971_GO:0006412 translation
## 0.50
## 19791_NA
## 0.50
## 2033_IPR002033 Sec-independent periplasmic protein translocase TatC
## 0.25
## 20761_IPR020761 Uncharacterised protein family UPF0114, bacteria
## 0.25
## 20921_GO:0009058 biosynthetic process
## 0.25
## 20948_NA
## 0.25
## 21120_GO:0016853 isomerase activity
## 0.25
## 2132_GO:0006412 translation
## 0.50
## 2141_IPR002141 Influenza virus nucleoprotein (NP)
## 0.25
## 2150_GO:0006412 translation
## 0.25
## 218_GO:0006412 translation
## 0.50
## 2196_GO:0006807 nitrogen compound metabolic process
## 0.25
## 22270_GO:0016491 oxidoreductase activity
## 0.75
## 22271_NA
## 0.25
## 22941_IPR022941 Signal recognition particle, SRP54 subunit
## 0.25
## 2301_GO:0044281 small molecule metabolic process
## 0.25
## 2302_GO:0044281 small molecule metabolic process
## 0.25
## 2303_GO:0044281 small molecule metabolic process
## 0.25
## 23473_NA
## 0.25
## 2381_IPR002381 Ribonuclease PH, bacterial-type
## 0.50
## 24791_GO:0006091 generation of precursor metabolites and energy
## 0.25
## 2504_GO:0009058 biosynthetic process
## 0.25
## 2549_NA
## 0.25
## 25703_GO:0006807 nitrogen compound metabolic process
## 0.25
## 26030_IPR001248 Purine-cytosine permease
## 0.25
## 27078_IPR027078 Small nuclear ribonucleoprotein E
## 0.25
## 27185_IPR017241 Toll-like receptor
## 0.50
## 2755_GO:0009058 biosynthetic process
## 0.25
## 2781_IPR002781 Transmembrane protein TauE-like
## 0.50
## 28268_IPR028268 Pianissimo family
## 0.50
## 2842_IPR002842 V-type ATPase subunit E
## 0.25
## 28927_NA
## 0.25
## 2975_IPR001019 Guanine nucleotide binding protein (G-protein), alpha subunit
## 0.25
## 29751_GO:0006412 translation
## 0.25
## 2994_IPR002994 Surfeit locus 1/Shy1
## 0.25
## 30559_IPR030559 DNA polymerase zeta catalytic subunit
## 0.25
## 31463_IPR031463 MICOS complex subunit Mic12
## 0.25
## 3170_GO:0016491 oxidoreductase activity
## 0.25
## 31723_GO:0016829 lyase activity
## 0.75
## 3329_NA
## 0.25
## 3448_GO:0009058 biosynthetic process
## 0.25
## 3544_IPR003544 Cytochrome c-type biogenesis protein CcmB
## 0.25
## 3669_GO:0009058 biosynthetic process
## 0.25
## 3673_GO:0016740 transferase activity
## 0.25
## 3724_GO:0009058 biosynthetic process
## 0.25
## 3752_IPR003752 Disulphide bond formation protein DsbB/BdbC
## 0.25
## 3758_GO:0009058 biosynthetic process
## 0.25
## 3764_IPR003764 N-acetylglucosamine-6-phosphate deacetylase
## 0.25
## 3837_IPR003837 Glu-tRNAGln amidotransferase C subunit
## 0.25
## 394_IPR000394 RNA polymerase sigma factor 54
## 0.25
## 4373_IPR004373 Peptide chain release factor 1
## 0.25
## 4506_IPR004506 tRNA-specific 2-thiouridylase
## 0.25
## 4528_IPR004528 3-deoxy-D-manno-octulosonate cytidylyltransferase
## 0.25
## 453_GO:0009058 biosynthetic process
## 0.25
## 4536_IPR004536 Selenophosphate synthetase
## 0.25
## 4569_GO:0009058 biosynthetic process
## 0.25
## 4607_GO:0009058 biosynthetic process
## 0.25
## 4625_GO:0009058 biosynthetic process
## 0.50
## 4695_IPR004695 Transporter protein SLAC1/Mae1/ Ssu1/TehA
## 0.25
## 4769_IPR004769 Adenylosuccinate lyase
## 0.25
## 4792_NA
## 0.25
## 4811_GO:0006807 nitrogen compound metabolic process
## 0.50
## 4835_GO:0016740 transferase activity
## 0.25
## 4903_IPR004903 Lactobacillus surface layer protein
## 0.25
## 4923_IPR004923 Iron permease FTR1/Fip1/EfeU
## 0.25
## 5128_GO:0009058 biosynthetic process
## 0.25
## 5133_IPR005133 Na+/H+ antiporter subunit G
## 0.75
## 5150_GO:0009058 biosynthetic process
## 1.00
## 5255_IPR005255 PdxA family
## 0.25
## 529_GO:0006412 translation
## 0.25
## 5338_GO:0006807 nitrogen compound metabolic process
## 0.50
## 5650_IPR005650 BlaI transcriptional regulatory family
## 0.25
## 5670_IPR005670 Phosphate transport system permease protein 1
## 0.25
## 5704_GO:0006412 translation
## 0.25
## 5759_IPR005759 Endonuclease III
## 0.25
## 577_GO:0005975 carbohydrate metabolic process
## 0.25
## 5813_GO:0006412 translation
## 0.25
## 5840_IPR005839 Methylthiotransferase
## 0.25
## 5930_GO:0009058 biosynthetic process
## 0.25
## 5967_IPR005948 Thiamine/thiamin pyrophosphate-binding periplasmic protein, ABC transporter
## 0.25
## 597_GO:0006412 translation
## 0.25
## 5982_IPR005982 Thioredoxin reductase
## 0.25
## 5996_GO:0006412 translation
## 0.25
## 6032_GO:0006412 translation
## 0.25
## 6035_GO:0046872 metal ion binding
## 0.25
## 6130_GO:0006807 nitrogen compound metabolic process
## 0.50
## 6298_IPR006298 GTP-binding protein TypA
## 0.25
## 630_GO:0006412 translation
## 0.25
## 639_IPR000639 Epoxide hydrolase-like
## 0.25
## 653_NA
## 0.25
## 682_GO:0006807 nitrogen compound metabolic process
## 0.25
## 7016_NA
## 0.50
## 7225_IPR007225 Exocyst complex component EXOC6/Sec15
## 0.25
## 7269_GO:0006807 nitrogen compound metabolic process
## 0.25
## 7305_IPR007305 Vesicle transport protein, Got1/SFT2-like
## 0.25
## 7315_GO:0009058 biosynthetic process
## 0.25
## 7325_GO:0006807 nitrogen compound metabolic process
## 0.50
## 7375_NA
## 0.25
## 7466_GO:0009058 biosynthetic process
## 0.25
## 748_IPR000748 Pseudouridine synthase, RsuA/RluB/E/F
## 0.25
## 7533_GO:0046872 metal ion binding
## 0.25
## 7721_GO:0005975 carbohydrate metabolic process
## 0.25
## 7801_NA
## 0.25
## 7812_IPR007812 Type II secretion system protein GspL
## 0.25
## 8141_GO:0006807 nitrogen compound metabolic process
## 0.75
## 92_GO:0009058 biosynthetic process
## 0.25
## 926_GO:0009058 biosynthetic process
## 0.25
## 93_IPR000093 DNA recombination protein RecR
## 0.25
## 9311_IPR009311 Interferon alpha-inducible protein IFI6/IFI27-like
## 0.25
## 968_IPR000968 Influenza nuclear export protein NS2
## 0.25
## 9734_NA
## 0.25
sort(table(list.selected.GOs)/4, decreasing = TRUE)
## list.selected.GOs
## 1
## 1.00
## 10226_IPR010226 NADH-quinone oxidoreductase, chain I
## 1.00
## 1036_IPR001036 Acriflavin resistance protein
## 1.00
## 14358_IPR014358 Enoyl-[acyl-carrier-protein] reductase (NADH)
## 1.00
## 1591_IPR001591 Influenza RNA-dependent RNA polymerase subunit PB2
## 1.00
## 5150_GO:0009058 biosynthetic process
## 1.00
## 22270_GO:0016491 oxidoreductase activity
## 0.75
## 31723_GO:0016829 lyase activity
## 0.75
## 5133_IPR005133 Na+/H+ antiporter subunit G
## 0.75
## 8141_GO:0006807 nitrogen compound metabolic process
## 0.75
## 11537_IPR011537 NADH ubiquinone oxidoreductase, F subunit
## 0.50
## 11864_IPR011864 Phosphate ABC transporter, permease protein PstC
## 0.50
## 12245_GO:0009058 biosynthetic process
## 0.50
## 13025_GO:0006412 translation
## 0.50
## 15815_GO:0016491 oxidoreductase activity
## 0.50
## 16484_IPR016484 GTP-binding protein EngA
## 0.50
## 17244_IPR017244 Ribosomal RNA large subunit methyltransferase K/L
## 0.50
## 1971_GO:0006412 translation
## 0.50
## 19791_NA
## 0.50
## 2132_GO:0006412 translation
## 0.50
## 218_GO:0006412 translation
## 0.50
## 2381_IPR002381 Ribonuclease PH, bacterial-type
## 0.50
## 27185_IPR017241 Toll-like receptor
## 0.50
## 2781_IPR002781 Transmembrane protein TauE-like
## 0.50
## 28268_IPR028268 Pianissimo family
## 0.50
## 4625_GO:0009058 biosynthetic process
## 0.50
## 4811_GO:0006807 nitrogen compound metabolic process
## 0.50
## 5338_GO:0006807 nitrogen compound metabolic process
## 0.50
## 6130_GO:0006807 nitrogen compound metabolic process
## 0.50
## 7016_NA
## 0.50
## 7325_GO:0006807 nitrogen compound metabolic process
## 0.50
## 10074_GO:0009058 biosynthetic process
## 0.25
## 10108_GO:0009058 biosynthetic process
## 0.25
## 1015_GO:0009058 biosynthetic process
## 0.25
## 10228_GO:0016491 oxidoreductase activity
## 0.25
## 10404_GO:0006807 nitrogen compound metabolic process
## 0.25
## 1062_IPR001062 Transcription antitermination protein, NusG
## 0.25
## 1063_GO:0006412 translation
## 0.25
## 11284_IPR011284 3-oxoacyl-(acyl-carrier-protein) reductase
## 0.25
## 115_GO:0009058 biosynthetic process
## 0.25
## 11701_IPR011701 Major facilitator superfamily
## 0.25
## 11806_IPR011806 Sulphite reductase, dissimilatory-type alpha subunit
## 0.25
## 1182_IPR001182 Probable peptidoglycan glycosyltransferase FtsW/RodA
## 0.25
## 11890_IPR024704 Structural maintenance of chromosomes protein
## 0.25
## 119_IPR000119 Histone-like DNA-binding protein
## 0.25
## 11900_IPR011900 Glutaredoxin, GrxC
## 0.25
## 1209_GO:0006412 translation
## 0.25
## 12098_IPR012098 SRP-independent targeting protein 3
## 0.25
## 12099_GO:0065003 protein-containing complex assembly
## 0.25
## 12147_GO:0016740 transferase activity
## 0.25
## 131_GO:0009058 biosynthetic process
## 0.25
## 13765_IPR013765 DNA recombination and repair protein RecA
## 0.25
## 13954_NA
## 0.25
## 14105_GO:0009058 biosynthetic process
## 0.25
## 1441_GO:0016740 transferase activity
## 0.25
## 14434_IPR014434 Monothiol glutaredoxin
## 0.25
## 15_IPR000015 Outer membrane usher protein
## 0.25
## 16299_GO:0009058 biosynthetic process
## 0.25
## 16932_IPR016932 Uncharacterised conserved protein UCP029669
## 0.25
## 17649_GO:0006807 nitrogen compound metabolic process
## 0.25
## 17666_IPR017666 2-aminoethylphosphonate ABC transport system, ATP-binding component PhnT2
## 0.25
## 17847_IPR017847 Type VI secretion system, RhsGE-associated Vgr family subset
## 0.25
## 19007_IPR019007 WW domain binding protein 11
## 0.25
## 19407_IPR019407 Cytoplasmic tRNA 2-thiolation protein 2
## 0.25
## 1951_IPR001951 Histone H4
## 0.25
## 2033_IPR002033 Sec-independent periplasmic protein translocase TatC
## 0.25
## 20761_IPR020761 Uncharacterised protein family UPF0114, bacteria
## 0.25
## 20921_GO:0009058 biosynthetic process
## 0.25
## 20948_NA
## 0.25
## 21120_GO:0016853 isomerase activity
## 0.25
## 2141_IPR002141 Influenza virus nucleoprotein (NP)
## 0.25
## 2150_GO:0006412 translation
## 0.25
## 2196_GO:0006807 nitrogen compound metabolic process
## 0.25
## 22271_NA
## 0.25
## 22941_IPR022941 Signal recognition particle, SRP54 subunit
## 0.25
## 2301_GO:0044281 small molecule metabolic process
## 0.25
## 2302_GO:0044281 small molecule metabolic process
## 0.25
## 2303_GO:0044281 small molecule metabolic process
## 0.25
## 23473_NA
## 0.25
## 24791_GO:0006091 generation of precursor metabolites and energy
## 0.25
## 2504_GO:0009058 biosynthetic process
## 0.25
## 2549_NA
## 0.25
## 25703_GO:0006807 nitrogen compound metabolic process
## 0.25
## 26030_IPR001248 Purine-cytosine permease
## 0.25
## 27078_IPR027078 Small nuclear ribonucleoprotein E
## 0.25
## 2755_GO:0009058 biosynthetic process
## 0.25
## 2842_IPR002842 V-type ATPase subunit E
## 0.25
## 28927_NA
## 0.25
## 2975_IPR001019 Guanine nucleotide binding protein (G-protein), alpha subunit
## 0.25
## 29751_GO:0006412 translation
## 0.25
## 2994_IPR002994 Surfeit locus 1/Shy1
## 0.25
## 30559_IPR030559 DNA polymerase zeta catalytic subunit
## 0.25
## 31463_IPR031463 MICOS complex subunit Mic12
## 0.25
## 3170_GO:0016491 oxidoreductase activity
## 0.25
## 3329_NA
## 0.25
## 3448_GO:0009058 biosynthetic process
## 0.25
## 3544_IPR003544 Cytochrome c-type biogenesis protein CcmB
## 0.25
## 3669_GO:0009058 biosynthetic process
## 0.25
## 3673_GO:0016740 transferase activity
## 0.25
## 3724_GO:0009058 biosynthetic process
## 0.25
## 3752_IPR003752 Disulphide bond formation protein DsbB/BdbC
## 0.25
## 3758_GO:0009058 biosynthetic process
## 0.25
## 3764_IPR003764 N-acetylglucosamine-6-phosphate deacetylase
## 0.25
## 3837_IPR003837 Glu-tRNAGln amidotransferase C subunit
## 0.25
## 394_IPR000394 RNA polymerase sigma factor 54
## 0.25
## 4373_IPR004373 Peptide chain release factor 1
## 0.25
## 4506_IPR004506 tRNA-specific 2-thiouridylase
## 0.25
## 4528_IPR004528 3-deoxy-D-manno-octulosonate cytidylyltransferase
## 0.25
## 453_GO:0009058 biosynthetic process
## 0.25
## 4536_IPR004536 Selenophosphate synthetase
## 0.25
## 4569_GO:0009058 biosynthetic process
## 0.25
## 4607_GO:0009058 biosynthetic process
## 0.25
## 4695_IPR004695 Transporter protein SLAC1/Mae1/ Ssu1/TehA
## 0.25
## 4769_IPR004769 Adenylosuccinate lyase
## 0.25
## 4792_NA
## 0.25
## 4835_GO:0016740 transferase activity
## 0.25
## 4903_IPR004903 Lactobacillus surface layer protein
## 0.25
## 4923_IPR004923 Iron permease FTR1/Fip1/EfeU
## 0.25
## 5128_GO:0009058 biosynthetic process
## 0.25
## 5255_IPR005255 PdxA family
## 0.25
## 529_GO:0006412 translation
## 0.25
## 5650_IPR005650 BlaI transcriptional regulatory family
## 0.25
## 5670_IPR005670 Phosphate transport system permease protein 1
## 0.25
## 5704_GO:0006412 translation
## 0.25
## 5759_IPR005759 Endonuclease III
## 0.25
## 577_GO:0005975 carbohydrate metabolic process
## 0.25
## 5813_GO:0006412 translation
## 0.25
## 5840_IPR005839 Methylthiotransferase
## 0.25
## 5930_GO:0009058 biosynthetic process
## 0.25
## 5967_IPR005948 Thiamine/thiamin pyrophosphate-binding periplasmic protein, ABC transporter
## 0.25
## 597_GO:0006412 translation
## 0.25
## 5982_IPR005982 Thioredoxin reductase
## 0.25
## 5996_GO:0006412 translation
## 0.25
## 6032_GO:0006412 translation
## 0.25
## 6035_GO:0046872 metal ion binding
## 0.25
## 6298_IPR006298 GTP-binding protein TypA
## 0.25
## 630_GO:0006412 translation
## 0.25
## 639_IPR000639 Epoxide hydrolase-like
## 0.25
## 653_NA
## 0.25
## 682_GO:0006807 nitrogen compound metabolic process
## 0.25
## 7225_IPR007225 Exocyst complex component EXOC6/Sec15
## 0.25
## 7269_GO:0006807 nitrogen compound metabolic process
## 0.25
## 7305_IPR007305 Vesicle transport protein, Got1/SFT2-like
## 0.25
## 7315_GO:0009058 biosynthetic process
## 0.25
## 7375_NA
## 0.25
## 7466_GO:0009058 biosynthetic process
## 0.25
## 748_IPR000748 Pseudouridine synthase, RsuA/RluB/E/F
## 0.25
## 7533_GO:0046872 metal ion binding
## 0.25
## 7721_GO:0005975 carbohydrate metabolic process
## 0.25
## 7801_NA
## 0.25
## 7812_IPR007812 Type II secretion system protein GspL
## 0.25
## 92_GO:0009058 biosynthetic process
## 0.25
## 926_GO:0009058 biosynthetic process
## 0.25
## 93_IPR000093 DNA recombination protein RecR
## 0.25
## 9311_IPR009311 Interferon alpha-inducible protein IFI6/IFI27-like
## 0.25
## 968_IPR000968 Influenza nuclear export protein NS2
## 0.25
## 9734_NA
## 0.25
Plotting MINT sPLS stability scores together with the heatmap, for GO terms:
# Row names in this object (mint.spls2.WQ.taxa.mat.cor) is how indicator taxa are ordered in the heatmap, Use this when setting the level in the aes() of ggplot2
# This is the table with Stability scores on dimension 1
MINT_sPLS_dim1_stability_GOs <- as.data.frame(table(list.selected.GOs)/4) %>%
separate(col = "list.selected.GOs", # I am splitting this column
sep = "_", # This is the separator
into = c("OTU", "GOs")
)
# Removing the taxa column - not needed:
MINT_sPLS_dim1_stability_GOs$GOs <- NULL
# getting names for GOs
GOs_biplot_colnames <- left_join(otu_table(megan_go_clr_5) %>%
as.data.frame %>%
rownames_to_column("OTU"),
tax_table(megan_go_clr_5) %>%
as.data.frame %>%
rownames_to_column("OTU")) %>%
unite(Function, c(Rank3, Rank4, Rank5, Rank6), sep = "; ") # Adding Function info
## Joining, by = "OTU"
GOs_biplot_colnames <- GOs_biplot_colnames %>%
dplyr::select("OTU", "Function")
# Merging:
MINT_sPLS_dim1_stability_GOs <- left_join(MINT_sPLS_dim1_stability_GOs,
GOs_biplot_colnames)
# Re-running MINT - I need GOs only as names (not GOs and actual annotations)
MINT_sPLS_similarity_scores_and_LOGOCV_GOs <- mint.spls(X = GOs_biplot,
Y = sample_data(megan_go_clr_5)[,24:40],
ncomp = 2,
study = sample_data(megan_go_clr_5)$Sampling_trip,
keepX = keepX, # 50 taxa on dims 1 and 2
mode = "regression")
# I am just making this object (cim_mint.spls2.WQ.taxa.OTUs) to merge with the stability scores based on OTU IDs
MINT_sPLS_similarity_scores_and_LOGOCV_GOs <- cim(MINT_sPLS_similarity_scores_and_LOGOCV_GOs,
comp = 1:2,
xlab = "WQ parameters",
ylab = "Indicator microbial GO terms (genes/functions)",
margins = c(19, # bottom
50), # right
row.names = MINT_sPLS_GOs_ind_names_cim,
symkey = FALSE,
keysize = c(1, 0.4),
title = "MINT sPLS GOs/WQ (PCs 1 and 2)")
# Extracting the correlation matrix from the MINT sPLS heatmap
mint.spls2.WQ.GOs.mat.cor <- MINT_sPLS_similarity_scores_and_LOGOCV_GOs$mat
# Merging them!
MINT_sPLS_dim1_stability_merged_GOs <- left_join(as.data.frame(mint.spls2.WQ.GOs.mat.cor) %>% rownames_to_column("OTU"),
MINT_sPLS_dim1_stability_GOs) # %>%
# filter(if_any(everything(), ~ !is.na(Freq))) # removing those that have NAs as stability scores
# Barplots
MINT_sPLS_stability_plots_ordered.dim1_GOs <- MINT_sPLS_dim1_stability_merged_GOs[,c(1, 19, 20)] %>% # Selecting OTU (1), Freq (19), and taxonomy (20)
ggplot(aes(y = factor(OTU, level = unique(row.names(as.data.frame(mint.spls2.WQ.GOs.mat.cor)))),
x = Freq,
fill = Freq > 0.25)) + # from Largest to smallest Stability
geom_bar(stat = "identity") +
scale_fill_manual(values = c("FALSE" = "grey40", "TRUE" = "seagreen3"), guide = FALSE) +
# scale_y_discrete(limits=rev) + # Reversing the order to match the heatmap
labs(y = 'MINT sPLS Indicator GO terms',
x = "LOGOCV Stability score - dim 1",
title = 'MINT sPLS Are these signals shared across trips?',
subtitle = 'Leave One Group Out Cross Validation (LOGOCV)') +
theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 12), legend.position = "NONE")
MINT_sPLS_stability_plots_ordered.dim1_GOs
# Now exporting this for RawGraphs, this can go in Supplementary Material
RawGraphs_shared_GOs_MINT <- MINT_sPLS_stability_plots_ordered.dim1_GOs$data %>%
as.data.frame() %>%
left_join(.,
tax_table(megan_go_clr_5) %>%
as.data.frame %>%
rownames_to_column("OTU")) %>%
# unite(full_taxonomy, c(Rank1, Rank2, Rank3, Rank4, Rank5, Rank6, Rank7), sep = "; ") %>% # Adding Taxonomy info
dplyr::filter(if_any(everything(), ~ !is.na(Freq)))
# Exporting as csv
write.csv(RawGraphs_shared_taxa_MINT, file = "/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Paper_drafts/the_analysis_is_finalised/Code/output_tables/RawGraphs_shared_GOs_MINT.csv", quote = F, row.names = F)
Numerical values - only for stable indicators (this will help me with text writing):
# First extracting similarity values from the MINT sPLS heatmap
MINT_sPLS_mat.corr <- left_join(cim_mint.spls2.WQ.taxa.OTUs[["mat"]] %>%
as.data.frame() %>%
rownames_to_column("OTU"),
megan_genus_clr@tax_table %>% # Adding taxonomy info too
as.data.frame() %>%
rownames_to_column("OTU")
)
# Now adding the stability scores!
MINT_sPLS_mat.corr_and_LOGOCV <- left_join(MINT_sPLS_mat.corr, RawGraphs_shared_taxa_MINT[,1:2])
# Visualising as a table
knitr::kable(MINT_sPLS_mat.corr_and_LOGOCV, caption = "MINT sPLS - numerical representation of similarity scores (partial correlations).")
| OTU | median_POC_µM | median_PN_µM | median_Chlorophyll_A_µg_L | median_PP_µM | SALINITY_2.5m_RV | median_DOC_µM | median_Phaeophytin_A_µg_L | median_TDN_µM | FLUORESCENCE_2.5m_RV | median_Si_µM | median_NO2_µM | median_TSS_mg_L | SEAWATER_TEMPERATURE_2.5m_RV | median_NH4_µM | median_NO3_µM | median_TDP_µM | median_PO4_µM | Domain | Phylum | Class | Order | Family | Genus | Species | Freq |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 135619_Unknown Family; Unknown Genus | 0.4033428 | 0.3610483 | 0.3550518 | 0.2666964 | 0.2069694 | 0.3203748 | 0.0669948 | -0.1305209 | -0.0820813 | -0.1154535 | -0.3702580 | -0.2279649 | -0.1782364 | -0.2482898 | -0.5055078 | -0.2462484 | -0.4499928 | NA | NA | NA | NA | NA | NA | NA | NA |
| 1890424_Unknown Family; Unknown Genus | 0.4000287 | 0.3577860 | 0.3504673 | 0.2632587 | 0.2043906 | 0.3202456 | 0.0627213 | -0.1338292 | -0.0834813 | -0.1151821 | -0.3719716 | -0.2296177 | -0.1794068 | -0.2488255 | -0.5050225 | -0.2423465 | -0.4467868 | NA | NA | NA | NA | NA | NA | NA | NA |
| 28221_Unknown Family; Unknown Genus | 0.4056784 | 0.3629686 | 0.3561470 | 0.2675225 | 0.2076618 | 0.3236726 | 0.0652372 | -0.1338014 | -0.0837521 | -0.1165123 | -0.3751429 | -0.2313170 | -0.1807870 | -0.2512120 | -0.5105491 | -0.2465917 | -0.4528818 | NA | NA | NA | NA | NA | NA | NA | NA |
| 31989_Rhodobacteraceae; Unknown Genus | 0.4055754 | 0.3626928 | 0.3550215 | 0.2666807 | 0.2070638 | 0.3251446 | 0.0629091 | -0.1364873 | -0.0850189 | -0.1169032 | -0.3780005 | -0.2334474 | -0.1823771 | -0.2527475 | -0.5126970 | -0.2453627 | -0.4530719 | NA | NA | NA | NA | NA | NA | NA | NA |
| 119060_Burkholderiaceae; Unknown Genus | 0.4057722 | 0.3627214 | 0.3543630 | 0.2661891 | 0.2067266 | 0.3265495 | 0.0610846 | -0.1387361 | -0.0860937 | -0.1172973 | -0.3805535 | -0.2353174 | -0.1837784 | -0.2541534 | -0.5147733 | -0.2445457 | -0.4535365 | NA | NA | NA | NA | NA | NA | NA | NA |
| 186802_Unknown Family; Unknown Genus | 0.4216702 | 0.3778952 | 0.3736732 | 0.2806748 | 0.2176846 | 0.3311964 | 0.0755955 | -0.1299136 | -0.0827148 | -0.1196889 | -0.3799840 | -0.2330611 | -0.1824027 | -0.2557276 | -0.5230028 | -0.2602413 | -0.4697065 | NA | NA | NA | NA | NA | NA | NA | NA |
| 561_Enterobacteriaceae; Escherichia | 0.4167212 | 0.3727744 | 0.3654223 | 0.2744912 | 0.2130936 | 0.3331127 | 0.0660767 | -0.1385452 | -0.0865536 | -0.1198542 | -0.3865506 | -0.2385003 | -0.1863708 | -0.2586979 | -0.5253690 | -0.2528316 | -0.4653330 | NA | NA | NA | NA | NA | NA | NA | NA |
| 1150_Unknown Family; Unknown Genus | 0.4134463 | 0.3710803 | 0.3695158 | 0.2775409 | 0.2150881 | 0.3200371 | 0.0811115 | -0.1191547 | -0.0772065 | -0.1160831 | -0.3636434 | -0.2218955 | -0.1738979 | -0.2459040 | -0.5059153 | -0.2586931 | -0.4596232 | NA | NA | NA | NA | NA | NA | NA | NA |
| 976_Unknown Family; Unknown Genus | 0.4066624 | 0.3646181 | 0.3613471 | 0.2714129 | 0.2104497 | 0.3179472 | 0.0750787 | -0.1227321 | -0.0785596 | -0.1150336 | -0.3636830 | -0.2227075 | -0.1743723 | -0.2451221 | -0.5022467 | -0.2520759 | -0.4527022 | NA | NA | NA | NA | NA | NA | NA | NA |
| 1161_Unknown Family; Unknown Genus | 0.3964776 | 0.3552035 | 0.3507030 | 0.2634237 | 0.2043393 | 0.3123777 | 0.0696384 | -0.1238471 | -0.0785758 | -0.1128002 | -0.3591222 | -0.2205013 | -0.1725247 | -0.2414459 | -0.4931755 | -0.2439663 | -0.4418341 | NA | NA | NA | NA | NA | NA | NA | NA |
| 1236_Unknown Family; Unknown Genus | 0.3880844 | 0.3460719 | 0.3341892 | 0.2510522 | 0.1952253 | 0.3194122 | 0.0478661 | -0.1451089 | -0.0882226 | -0.1141043 | -0.3774493 | -0.2350566 | -0.1832382 | -0.2503776 | -0.5027344 | -0.2285596 | -0.4351598 | NA | NA | NA | NA | NA | NA | NA | NA |
| 1263978_Rhodospirillaceae; Candidatus Endolissoclinum | 0.3852173 | 0.3430686 | 0.3292031 | 0.2473157 | 0.1924570 | 0.3208316 | 0.0418915 | -0.1506509 | -0.0907029 | -0.1142837 | -0.3818412 | -0.2386434 | -0.1858627 | -0.2524166 | -0.5045585 | -0.2240347 | -0.4326868 | NA | NA | NA | NA | NA | NA | NA | NA |
| 91347_Unknown Family; Unknown Genus | 0.3918159 | 0.3490571 | 0.3354727 | 0.2520234 | 0.1960858 | 0.3253809 | 0.0440168 | -0.1515751 | -0.0914722 | -0.1159853 | -0.3865838 | -0.2413982 | -0.1880501 | -0.2557666 | -0.5118144 | -0.2285826 | -0.4399128 | NA | NA | NA | NA | NA | NA | NA | NA |
| 82115_Rhizobiaceae; Unknown Genus | 0.3748928 | 0.3339565 | 0.3208460 | 0.2410357 | 0.1875443 | 0.3115329 | 0.0418096 | -0.1453884 | -0.0876919 | -0.1110313 | -0.3702775 | -0.2312616 | -0.1801445 | -0.2449313 | -0.4900099 | -0.2185554 | -0.4209526 | NA | NA | NA | NA | NA | NA | NA | NA |
| 356_Unknown Family; Unknown Genus | 0.4217166 | 0.3767941 | 0.3672718 | 0.2758896 | 0.2143148 | 0.3409058 | 0.0612169 | -0.1468562 | -0.0907404 | -0.1223189 | -0.3984042 | -0.2467119 | -0.1926050 | -0.2657092 | -0.5372354 | -0.2530105 | -0.4716571 | NA | NA | NA | NA | NA | NA | NA | NA |
| 1224_Unknown Family; Unknown Genus | 0.4138764 | 0.3695250 | 0.3589550 | 0.2696476 | 0.2095462 | 0.3368031 | 0.0567544 | -0.1480376 | -0.0909058 | -0.1206495 | -0.3952441 | -0.2452736 | -0.1913770 | -0.2630693 | -0.5305230 | -0.2466293 | -0.4633272 | NA | NA | NA | NA | NA | NA | NA | NA |
| 570_Enterobacteriaceae; Klebsiella | 0.4129173 | 0.3681086 | 0.3549652 | 0.2666619 | 0.2073970 | 0.3407642 | 0.0495706 | -0.1559926 | -0.0946247 | -0.1216527 | -0.4033370 | -0.2513840 | -0.1959247 | -0.2673386 | -0.5362420 | -0.2424993 | -0.4631843 | NA | NA | NA | NA | NA | NA | NA | NA |
| 28216_Unknown Family; Unknown Genus | 0.4047091 | 0.3611206 | 0.3497665 | 0.2627496 | 0.2042527 | 0.3312014 | 0.0527332 | -0.1480110 | -0.0904325 | -0.1184799 | -0.3900205 | -0.2424585 | -0.1890943 | -0.2591546 | -0.5214954 | -0.2397718 | -0.4534294 | NA | NA | NA | NA | NA | NA | NA | NA |
| 41295_Rhodospirillaceae; Unknown Genus | 0.3904162 | 0.3470986 | 0.3302621 | 0.2481238 | 0.1932718 | 0.3302424 | 0.0349000 | -0.1615758 | -0.0961378 | -0.1172006 | -0.3966480 | -0.2490209 | -0.1937194 | -0.2610514 | -0.5188136 | -0.2232452 | -0.4395238 | NA | NA | NA | NA | NA | NA | NA | NA |
| 2_Unknown Family; Unknown Genus | 0.4454294 | 0.4000795 | 0.3997550 | 0.3002475 | 0.2325983 | 0.3423102 | 0.0910808 | -0.1240248 | -0.0811203 | -0.1243910 | -0.3870541 | -0.2355617 | -0.1847353 | -0.2623703 | -0.5414112 | -0.2805692 | -0.4946908 | NA | NA | NA | NA | NA | NA | NA | NA |
| 543_Enterobacteriaceae; Unknown Genus | 0.4331295 | 0.3884009 | 0.3851590 | 0.2892973 | 0.2243015 | 0.3381989 | 0.0806217 | -0.1299473 | -0.0833067 | -0.1224010 | -0.3865141 | -0.2365803 | -0.1852563 | -0.2606213 | -0.5342879 | -0.2688132 | -0.4820790 | NA | NA | NA | NA | NA | NA | NA | NA |
| 135622_Unknown Family; Unknown Genus | 0.4317076 | 0.3916543 | 0.4094259 | 0.3074336 | 0.2370146 | 0.2987556 | 0.1373711 | -0.0624354 | -0.0512646 | -0.1116289 | -0.3124135 | -0.1818086 | -0.1442970 | -0.2203214 | -0.4763597 | -0.2966994 | -0.4729716 | NA | NA | NA | NA | NA | NA | NA | NA |
| 204455_Unknown Family; Unknown Genus | 0.4220560 | 0.3824063 | 0.3974997 | 0.2984876 | 0.2302551 | 0.2962394 | 0.1281081 | -0.0683251 | -0.0535686 | -0.1102595 | -0.3133386 | -0.1836079 | -0.1454532 | -0.2196794 | -0.4718104 | -0.2869418 | -0.4632147 | NA | NA | NA | NA | NA | NA | NA | NA |
| 12916_Comamonadaceae; Acidovorax | 0.4241158 | 0.3852801 | 0.4051201 | 0.3041908 | 0.2343712 | 0.2891565 | 0.1414183 | -0.0537328 | -0.0467618 | -0.1084903 | -0.2986635 | -0.1724907 | -0.1371854 | -0.2119756 | -0.4616149 | -0.2947430 | -0.4638011 | NA | NA | NA | NA | NA | NA | NA | NA |
| 1501348_Amoebophilaceae; Unknown Genus | 0.4204972 | 0.3817519 | 0.4003049 | 0.3005797 | 0.2316558 | 0.2887293 | 0.1371776 | -0.0568443 | -0.0480535 | -0.1081165 | -0.2999911 | -0.1738924 | -0.1381622 | -0.2122661 | -0.4606657 | -0.2906972 | -0.4602444 | NA | NA | NA | NA | NA | NA | NA | NA |
| 265488_Pirellulaceae; Rhodopirellula | 0.3806425 | 0.3435812 | 0.3511540 | 0.2637107 | 0.2037942 | 0.2781944 | 0.0991429 | -0.0809117 | -0.0574476 | -0.1024225 | -0.3035360 | -0.1811183 | -0.1427841 | -0.2094662 | -0.4416681 | -0.2505135 | -0.4199264 | NA | NA | NA | NA | NA | NA | NA | NA |
| 367771_Roseobacteraceae; Marinovum | 0.4086796 | 0.3676470 | 0.3700198 | 0.2779026 | 0.2151181 | 0.3091943 | 0.0908154 | -0.1052626 | -0.0703882 | -0.1128097 | -0.3458602 | -0.2092615 | -0.1643633 | -0.2357085 | -0.4896002 | -0.2610789 | -0.4529200 | NA | NA | NA | NA | NA | NA | NA | NA |
| 1706369_Unknown Family; Unknown Genus | 0.4024044 | 0.3620001 | 0.3643281 | 0.2736279 | 0.2118096 | 0.3044617 | 0.0893984 | -0.1036727 | -0.0693200 | -0.1110816 | -0.3405783 | -0.2060697 | -0.1618555 | -0.2321047 | -0.4821046 | -0.2570587 | -0.4459684 | NA | NA | NA | NA | NA | NA | NA | NA |
| 1117_Unknown Family; Unknown Genus | 0.3939522 | 0.3541367 | 0.3552108 | 0.2667855 | 0.2065891 | 0.3002660 | 0.0842496 | -0.1053441 | -0.0696866 | -0.1093434 | -0.3376033 | -0.2048392 | -0.1607709 | -0.2294926 | -0.4752012 | -0.2500088 | -0.4370329 | NA | NA | NA | NA | NA | NA | NA | NA |
| 80865_Comamonadaceae; Delftia | 0.4115579 | 0.3715627 | 0.3801038 | 0.2854500 | 0.2205724 | 0.3001444 | 0.1081540 | -0.0863551 | -0.0615791 | -0.1105668 | -0.3269641 | -0.1949206 | -0.1537023 | -0.2258156 | -0.4765953 | -0.2713438 | -0.4539058 | NA | NA | NA | NA | NA | NA | NA | NA |
| 265976_Ornithinimicrobiaceae; Serinicoccus | 0.4091966 | 0.3693561 | 0.3775013 | 0.2834971 | 0.2190847 | 0.2990554 | 0.1065919 | -0.0869676 | -0.0617505 | -0.1101037 | -0.3262911 | -0.1946940 | -0.1534869 | -0.2251714 | -0.4747886 | -0.2693119 | -0.4514259 | NA | NA | NA | NA | NA | NA | NA | NA |
| 1406885_Alteromonadaceae; Aliiglaciecola | 0.3582115 | 0.3215356 | 0.3203201 | 0.2405897 | 0.1864428 | 0.2770259 | 0.0706554 | -0.1027889 | -0.0666803 | -0.1005057 | -0.3145765 | -0.1918912 | -0.1503968 | -0.2127892 | -0.4379525 | -0.2243245 | -0.3981692 | NA | NA | NA | NA | NA | NA | NA | NA |
| 165697_Sphingomonadaceae; Sphingopyxis | 0.3589377 | 0.3219899 | 0.3198557 | 0.2402449 | 0.1862341 | 0.2792596 | 0.0683115 | -0.1059237 | -0.0682013 | -0.1011619 | -0.3183913 | -0.1946356 | -0.1524619 | -0.2149412 | -0.4412908 | -0.2235243 | -0.3993046 | NA | NA | NA | NA | NA | NA | NA | NA |
| 167375_Prochlorococcaceae; Cyanobium | 0.3434407 | 0.3071863 | 0.3009616 | 0.2260715 | 0.1755151 | 0.2748366 | 0.0540081 | -0.1147102 | -0.0715833 | -0.0988594 | -0.3191491 | -0.1969851 | -0.1539152 | -0.2135164 | -0.4334251 | -0.2081446 | -0.3835633 | NA | NA | NA | NA | NA | NA | NA | NA |
| 183963_Unknown Family; Unknown Genus | 0.3341531 | 0.2987922 | 0.2923328 | 0.2195917 | 0.1705106 | 0.2681398 | 0.0514536 | -0.1128954 | -0.0702571 | -0.0963850 | -0.3119159 | -0.1926942 | -0.1505272 | -0.2084992 | -0.4227819 | -0.2019637 | -0.3733351 | NA | NA | NA | NA | NA | NA | NA | NA |
| 1890426_Synechococcaceae; Unknown Genus | 0.3721298 | 0.3338817 | 0.3319364 | 0.2493176 | 0.1932500 | 0.2890355 | 0.0715478 | -0.1089630 | -0.0703036 | -0.1047479 | -0.3291664 | -0.2011019 | -0.1575519 | -0.2223391 | -0.4567947 | -0.2321056 | -0.4138846 | NA | NA | NA | NA | NA | NA | NA | NA |
| 1680826_Candidatus Thalassarchaeaceae; Candidatus Thalassarchaeum | 0.3640554 | 0.3255715 | 0.3187260 | 0.2394166 | 0.1858919 | 0.2917846 | 0.0565787 | -0.1223854 | -0.0762540 | -0.1049155 | -0.3391631 | -0.2094446 | -0.1636288 | -0.2267969 | -0.4601023 | -0.2202996 | -0.4066750 | NA | NA | NA | NA | NA | NA | NA | NA |
| 213422_Geobacteraceae; Unknown Genus | 0.3056509 | 0.2739048 | 0.2707727 | 0.2033842 | 0.1577445 | 0.2402011 | 0.0546014 | -0.0943979 | -0.0600649 | -0.0867928 | -0.2756829 | -0.1691204 | -0.1323537 | -0.1855008 | -0.3792944 | -0.1885397 | -0.3404961 | NA | NA | NA | NA | NA | NA | NA | NA |
| 213421_Desulfuromonadaceae; Unknown Genus | 0.3034515 | 0.2719144 | 0.2687149 | 0.2018389 | 0.1565517 | 0.2386370 | 0.0539641 | -0.0940061 | -0.0597688 | -0.0862127 | -0.2740113 | -0.1681349 | -0.1315743 | -0.1843350 | -0.3768058 | -0.1870598 | -0.3380782 | NA | NA | NA | NA | NA | NA | NA | NA |
| 204441_Unknown Family; Unknown Genus | 0.2685214 | 0.2344867 | 0.2032343 | 0.1527781 | 0.1203314 | 0.2630399 | -0.0293990 | -0.1739650 | -0.0958783 | -0.0903219 | -0.3410260 | -0.2218470 | -0.1710320 | -0.2164924 | -0.4094469 | -0.1265977 | -0.3093450 | NA | NA | NA | NA | NA | NA | NA | NA |
| 85006_Unknown Family; Unknown Genus | 0.2562314 | 0.2238913 | 0.1947040 | 0.1463623 | 0.1152303 | 0.2498423 | -0.0263305 | -0.1639755 | -0.0905300 | -0.0858746 | -0.3232165 | -0.2100616 | -0.1619846 | -0.2053917 | -0.3890093 | -0.1216728 | -0.2949590 | NA | NA | NA | NA | NA | NA | NA | NA |
| 1655514_Pelagibacteraceae; Unknown Genus | 0.2281492 | 0.1985453 | 0.1688089 | 0.1269156 | 0.1002014 | 0.2293008 | -0.0336188 | -0.1579755 | -0.0862772 | -0.0783135 | -0.3007896 | -0.1966748 | -0.1514322 | -0.1899201 | -0.3563992 | -0.1032041 | -0.2639752 | NA | NA | NA | NA | NA | NA | NA | NA |
| 1706372_Halieaceae; Unknown Genus | 0.3165124 | 0.2781703 | 0.2495666 | 0.1875658 | 0.1471102 | 0.2950226 | -0.0123008 | -0.1787556 | -0.1005589 | -0.1023988 | -0.3734213 | -0.2403274 | -0.1857796 | -0.2397203 | -0.4606013 | -0.1605025 | -0.3616819 | NA | NA | NA | NA | NA | NA | NA | NA |
| 1239_Unknown Family; Unknown Genus | 0.3168624 | 0.2779682 | 0.2469691 | 0.1856252 | 0.1457592 | 0.2996631 | -0.0187311 | -0.1865035 | -0.1042456 | -0.1036792 | -0.3820311 | -0.2466700 | -0.1905264 | -0.2444247 | -0.4674328 | -0.1574421 | -0.3629287 | NA | NA | NA | NA | NA | NA | NA | NA |
| 1341118_Halieaceae; Luminiphilus | 0.4382632 | 0.3996907 | 0.4274216 | 0.3209071 | 0.2468184 | 0.2856079 | 0.1657596 | -0.0324347 | -0.0373871 | -0.1085399 | -0.2835578 | -0.1596597 | -0.1278729 | -0.2054700 | -0.4576783 | -0.3144768 | -0.4766823 | NA | NA | NA | NA | NA | NA | NA | NA |
| 574899_Verrucomicrobiaceae; Haloferula | 0.4300113 | 0.3917026 | 0.4167657 | 0.3129152 | 0.2407973 | 0.2841463 | 0.1568143 | -0.0386771 | -0.0399285 | -0.1075556 | -0.2856590 | -0.1621694 | -0.1295873 | -0.2056308 | -0.4547994 | -0.3056169 | -0.4684758 | NA | NA | NA | NA | NA | NA | NA | NA |
| 1803399_Unknown Family; Candidatus Peribacter | 0.4032001 | 0.3686355 | 0.3984233 | 0.2991183 | 0.2298097 | 0.2549547 | 0.1641042 | -0.0161835 | -0.0279289 | -0.0977452 | -0.2460459 | -0.1358947 | -0.1094284 | -0.1810020 | -0.4096268 | -0.2951736 | -0.4370138 | NA | NA | NA | NA | NA | NA | NA | NA |
| 455358_Balneolaceae; Balneola | 0.4922776 | 0.4517483 | 0.4958695 | 0.3722464 | 0.2855451 | 0.2971315 | 0.2214039 | 0.0050034 | -0.0223728 | -0.1155118 | -0.2735204 | -0.1459869 | -0.1187097 | -0.2064305 | -0.4793893 | -0.3710042 | -0.5307840 | NA | NA | NA | NA | NA | NA | NA | NA |
| 1931200_Rhodobacteraceae; Marinibacterium | 0.4423731 | 0.4098223 | 0.4674196 | 0.3508208 | 0.2680919 | 0.2342509 | 0.2476827 | 0.0618267 | 0.0070444 | -0.0949395 | -0.1835511 | -0.0850440 | -0.0721918 | -0.1517950 | -0.3827854 | -0.3579791 | -0.4705452 | NA | NA | NA | NA | NA | NA | NA | NA |
| 1813606_Balneolaceae; Unknown Genus | 0.3422029 | 0.3157030 | 0.3541367 | 0.2658188 | 0.2034657 | 0.1923799 | 0.1749808 | 0.0282743 | -0.0038099 | -0.0764640 | -0.1632154 | -0.0815239 | -0.0676054 | -0.1289191 | -0.3124805 | -0.2685341 | -0.3661889 | NA | NA | NA | NA | NA | NA | NA | NA |
| 2146_Acholeplasmataceae; Unknown Genus | 0.5356608 | 0.5002872 | 0.5887749 | 0.4418354 | 0.3366302 | 0.2494391 | 0.3507968 | 0.1347356 | 0.0368822 | -0.1057053 | -0.1572591 | -0.0547897 | -0.0514039 | -0.1486032 | -0.4133741 | -0.4591447 | -0.5630583 | NA | NA | NA | NA | NA | NA | NA | NA |
| 544448_Unknown Family; Unknown Genus | 0.5329406 | 0.4976408 | 0.5851881 | 0.4391455 | 0.3346064 | 0.2490685 | 0.3476826 | 0.1324831 | 0.0359522 | -0.1054109 | -0.1581631 | -0.0557737 | -0.0520862 | -0.1487706 | -0.4125881 | -0.4561406 | -0.5603749 | NA | NA | NA | NA | NA | NA | NA | NA |
| 31969_Unknown Family; Unknown Genus | 0.5406497 | 0.5059225 | 0.5997610 | 0.4500639 | 0.3426641 | 0.2435006 | 0.3663517 | 0.1504488 | 0.0440725 | -0.1044547 | -0.1430269 | -0.0436629 | -0.0431861 | -0.1414860 | -0.4051173 | -0.4696212 | -0.5666805 | NA | NA | NA | NA | NA | NA | NA | NA |
| 662_Vibrionaceae; Vibrio | 0.1993392 | 0.1707143 | 0.1319348 | 0.0992590 | 0.0793528 | 0.2237036 | -0.0641149 | -0.1789052 | -0.0947406 | -0.0747434 | -0.3071867 | -0.2047410 | -0.1568977 | -0.1899729 | -0.3456238 | -0.0726416 | -0.2352265 | NA | NA | NA | NA | NA | NA | NA | NA |
| 335929_Erythrobacteraceae; Unknown Genus | 0.1810788 | 0.1538317 | 0.1128332 | 0.0849218 | 0.0683880 | 0.2137447 | -0.0739085 | -0.1809509 | -0.0947916 | -0.0707462 | -0.2990603 | -0.2008230 | -0.1536131 | -0.1834093 | -0.3293991 | -0.0580820 | -0.2157464 | NA | NA | NA | NA | NA | NA | NA | NA |
| 69277_Phyllobacteriaceae; Unknown Genus | 0.1779742 | 0.1496224 | 0.1020364 | 0.0768408 | 0.0625469 | 0.2233862 | -0.0924319 | -0.2011350 | -0.1041938 | -0.0731330 | -0.3192141 | -0.2161225 | -0.1649860 | -0.1939566 | -0.3432506 | -0.0471001 | -0.2146595 | NA | NA | NA | NA | NA | NA | NA | NA |
| 2742_Marinobacteraceae; Marinobacter | 0.1331937 | 0.1081237 | 0.0546461 | 0.0412723 | 0.0353694 | 0.1997851 | -0.1176703 | -0.2075889 | -0.1049994 | -0.0635527 | -0.3008457 | -0.2076709 | -0.1577957 | -0.1787056 | -0.3046656 | -0.0107789 | -0.1670491 | NA | NA | NA | NA | NA | NA | NA | NA |
| 1213_Prochloraceae; Unknown Genus | 0.1529571 | 0.1228809 | 0.0555024 | 0.0419750 | 0.0367972 | 0.2403176 | -0.1513249 | -0.2574464 | -0.1296031 | -0.0759283 | -0.3661729 | -0.2538223 | -0.1926711 | -0.2164260 | -0.3658281 | -0.0042066 | -0.1939733 | NA | NA | NA | NA | NA | NA | NA | NA |
| 72275_Alteromonadaceae; Unknown Genus | 0.0762656 | 0.0557564 | -0.0034086 | -0.0023067 | 0.0019737 | 0.1664918 | -0.1448620 | -0.2100361 | -0.1032970 | -0.0504835 | -0.2712438 | -0.1922926 | -0.1451918 | -0.1559321 | -0.2507926 | 0.0329262 | -0.1058776 | NA | NA | NA | NA | NA | NA | NA | NA |
| 72274_Unknown Family; Unknown Genus | 0.0438098 | 0.0246905 | -0.0433304 | -0.0322531 | -0.0206605 | 0.1577561 | -0.1756029 | -0.2293615 | -0.1108174 | -0.0458042 | -0.2738332 | -0.1979566 | -0.1487908 | -0.1534910 | -0.2350925 | 0.0655324 | -0.0730138 | NA | NA | NA | NA | NA | NA | NA | NA |
| 1218_Prochlorococcaceae; Prochlorococcus | 0.0600530 | 0.0373776 | -0.0394784 | -0.0293222 | -0.0178286 | 0.1863425 | -0.1962329 | -0.2620665 | -0.1271215 | -0.0546968 | -0.3185440 | -0.2292298 | -0.1724789 | -0.1796291 | -0.2784344 | 0.0673867 | -0.0942146 | NA | NA | NA | NA | NA | NA | NA | NA |
| 1839_Nocardioidaceae; Nocardioides | 0.0940781 | 0.0785494 | 0.0508821 | 0.0383348 | 0.0314534 | 0.1226697 | -0.0556817 | -0.1143478 | -0.0588785 | -0.0398993 | -0.1774526 | -0.1207040 | -0.0920405 | -0.1072460 | -0.1881654 | -0.0214552 | -0.1143705 | NA | NA | NA | NA | NA | NA | NA | NA |
| 135623_Unknown Family; Unknown Genus | 0.0770208 | 0.0592405 | 0.0130884 | 0.0100280 | 0.0107013 | 0.1433211 | -0.1093818 | -0.1686805 | -0.0837508 | -0.0442690 | -0.2267736 | -0.1592368 | -0.1205035 | -0.1319374 | -0.2169051 | 0.0146255 | -0.1020539 | NA | NA | NA | NA | NA | NA | NA | NA |
| 53246_Pseudoalteromonadaceae; Pseudoalteromonas | 0.0420457 | 0.0285680 | -0.0141183 | -0.0104211 | -0.0053593 | 0.1101638 | -0.1071944 | -0.1479532 | -0.0721774 | -0.0328031 | -0.1844520 | -0.1318959 | -0.0993884 | -0.1048749 | -0.1651920 | 0.0319434 | -0.0619782 | NA | NA | NA | NA | NA | NA | NA | NA |
| 49_Polyangiaceae; Unknown Genus | 0.0349249 | 0.0220151 | -0.0213946 | -0.0158831 | -0.0095443 | 0.1060214 | -0.1106284 | -0.1482978 | -0.0719827 | -0.0311743 | -0.1807911 | -0.1300033 | -0.0978350 | -0.1020490 | -0.1584855 | 0.0374267 | -0.0543309 | NA | NA | NA | NA | NA | NA | NA | NA |
| 390876_Thalassobaculaceae; Nisaea | -0.0029652 | -0.0147777 | -0.0709625 | -0.0530576 | -0.0375284 | 0.1002695 | -0.1531299 | -0.1786407 | -0.0844473 | -0.0269144 | -0.1922624 | -0.1428790 | -0.1067174 | -0.1037745 | -0.1466727 | 0.0788295 | -0.0168374 | NA | NA | NA | NA | NA | NA | NA | NA |
| 2745_Halomonadaceae; Halomonas | 0.0020200 | -0.0124173 | -0.0784259 | -0.0586211 | -0.0412136 | 0.1220236 | -0.1787679 | -0.2113951 | -0.1002090 | -0.0331553 | -0.2306474 | -0.1707612 | -0.1276514 | -0.1251535 | -0.1789969 | 0.0891404 | -0.0258922 | NA | NA | NA | NA | NA | NA | NA | NA |
| 1041_Erythrobacteraceae; Erythrobacter | 0.0199626 | 0.0105310 | -0.0238011 | -0.0177294 | -0.0115514 | 0.0779751 | -0.0890757 | -0.1151719 | -0.0555437 | -0.0225192 | -0.1363493 | -0.0987818 | -0.0742106 | -0.0762082 | -0.1160496 | 0.0344322 | -0.0344656 | NA | NA | NA | NA | NA | NA | NA | NA |
| 1313115_Thalassospiraceae; Magnetovibrio | -0.0720720 | -0.0766999 | -0.1321445 | -0.0990130 | -0.0731734 | 0.0459022 | -0.1653881 | -0.1571952 | -0.0708183 | -0.0072750 | -0.1298201 | -0.1045589 | -0.0767310 | -0.0617731 | -0.0608296 | 0.1214139 | 0.0601593 | NA | NA | NA | NA | NA | NA | NA | NA |
| 766_Unknown Family; Unknown Genus | -0.0828928 | -0.0862681 | -0.1410048 | -0.1056708 | -0.0783757 | 0.0363089 | -0.1657004 | -0.1519463 | -0.0677889 | -0.0039075 | -0.1179899 | -0.0970368 | -0.0708983 | -0.0540847 | -0.0458048 | 0.1272709 | 0.0724277 | NA | NA | NA | NA | NA | NA | NA | NA |
| 1868329_Flavobacteriaceae; Xanthomarina | 0.2905098 | 0.2814517 | 0.3764087 | 0.2823047 | 0.2126434 | 0.0495605 | 0.3177452 | 0.2230882 | 0.0910436 | -0.0341382 | 0.0775781 | 0.0910287 | 0.0623546 | 0.0076118 | -0.0985725 | -0.3133444 | -0.2885414 | NA | NA | NA | NA | NA | NA | NA | NA |
| 261827_Flavobacteriaceae; Algibacter | 0.2895329 | 0.2804477 | 0.3748187 | 0.2811130 | 0.2117575 | 0.0498806 | 0.3159528 | 0.2214862 | 0.0903340 | -0.0341550 | 0.0763925 | 0.0900370 | 0.0616326 | 0.0070854 | -0.0989543 | -0.3119254 | -0.2876667 | NA | NA | NA | NA | NA | NA | NA | NA |
| 561367_Flavobacteriaceae; Salinimicrobium | 0.2796548 | 0.2707874 | 0.3615115 | 0.2711339 | 0.2042593 | 0.0489586 | 0.3040134 | 0.2125649 | 0.0866057 | -0.0332007 | 0.0723044 | 0.0858666 | 0.0587089 | 0.0060411 | -0.0967211 | -0.3006981 | -0.2780053 | NA | NA | NA | NA | NA | NA | NA | NA |
| 153265_Flavobacteriaceae; Aequorivita | 0.2701101 | 0.2624642 | 0.3543531 | 0.2657524 | 0.2000167 | 0.0395103 | 0.3052049 | 0.2189209 | 0.0900954 | -0.0299636 | 0.0846134 | 0.0938910 | 0.0648920 | 0.0138378 | -0.0820229 | -0.2962720 | -0.2669902 | NA | NA | NA | NA | NA | NA | NA | NA |
| 574559_Robiginitomaculaceae; Hellea | 0.2988090 | 0.2926727 | 0.4050943 | 0.3037749 | 0.2281646 | 0.0240523 | 0.3668675 | 0.2765801 | 0.1159579 | -0.0278296 | 0.1309492 | 0.1315537 | 0.0924774 | 0.0355338 | -0.0619334 | -0.3425022 | -0.2914991 | NA | NA | NA | NA | NA | NA | NA | NA |
| 225842_Flavobacteriaceae; Formosa | 0.3645983 | 0.3511518 | 0.4606864 | 0.3455412 | 0.2607011 | 0.0797928 | 0.3726130 | 0.2491934 | 0.0996821 | -0.0476038 | 0.0639367 | 0.0894626 | 0.0597377 | -0.0085500 | -0.1494926 | -0.3800531 | -0.3655815 | NA | NA | NA | NA | NA | NA | NA | NA |
| 1608457_Rhodobacteraceae; Aestuariivita | 0.3699578 | 0.3531503 | 0.4496229 | 0.3372876 | 0.2551379 | 0.1077441 | 0.3382620 | 0.2059926 | 0.0789547 | -0.0555480 | 0.0139983 | 0.0530583 | 0.0324277 | -0.0362303 | -0.1909318 | -0.3655428 | -0.3762121 | NA | NA | NA | NA | NA | NA | NA | NA |
| 117747_Unknown Family; Unknown Genus | 0.2375288 | 0.2252696 | 0.2804004 | 0.2103658 | 0.1594494 | 0.0816032 | 0.1986959 | 0.1105083 | 0.0403935 | -0.0390260 | -0.0146231 | 0.0165616 | 0.0077389 | -0.0360485 | -0.1407970 | -0.2253680 | -0.2439838 | NA | NA | NA | NA | NA | NA | NA | NA |
| 1123967_Porticoccaceae; Porticoccus | 0.1820252 | 0.1707939 | 0.2045237 | 0.1534683 | 0.1167358 | 0.0780822 | 0.1291423 | 0.0574769 | 0.0180698 | -0.0341128 | -0.0407456 | -0.0092079 | -0.0104353 | -0.0436230 | -0.1306802 | -0.1610378 | -0.1900237 | NA | NA | NA | NA | NA | NA | NA | NA |
| 489140_Geminicoccaceae; Geminicoccus | 0.0641967 | 0.0691734 | 0.1225230 | 0.0917956 | 0.0677158 | -0.0481202 | 0.1580750 | 0.1526779 | 0.0690750 | 0.0084370 | 0.1293784 | 0.1033229 | 0.0759611 | 0.0624666 | 0.0647831 | -0.1135758 | -0.0521658 | NA | NA | NA | NA | NA | NA | NA | NA |
| 1649495_Flavobacteriaceae; Seonamhaeicola | 0.0378089 | 0.0451591 | 0.0970754 | 0.0726886 | 0.0530063 | -0.0657482 | 0.1487365 | 0.1553861 | 0.0716837 | 0.0150889 | 0.1472714 | 0.1135439 | 0.0841146 | 0.0752820 | 0.0929723 | -0.0949651 | -0.0233800 | NA | NA | NA | NA | NA | NA | NA | NA |
| 1738655_Woeseiaceae; Woeseia | 0.0157185 | 0.0224251 | 0.0609415 | 0.0456067 | 0.0328798 | -0.0582387 | 0.1078008 | 0.1186850 | 0.0554140 | 0.0146337 | 0.1199443 | 0.0907360 | 0.0675013 | 0.0630981 | 0.0839406 | -0.0626742 | -0.0036532 | NA | NA | NA | NA | NA | NA | NA | NA |
| 1284657_Rhodobacteraceae; Planktomarina | 0.0152383 | 0.0244924 | 0.0745977 | 0.0558139 | 0.0400499 | -0.0797582 | 0.1391606 | 0.1558335 | 0.0730300 | 0.0204897 | 0.1605470 | 0.1207821 | 0.0899645 | 0.0851447 | 0.1155188 | -0.0782451 | 0.0010320 | NA | NA | NA | NA | NA | NA | NA | NA |
| 358023_Flavobacteriaceae; Lutibacter | 0.0038996 | 0.0136192 | 0.0605374 | 0.0452672 | 0.0320828 | -0.0826401 | 0.1281681 | 0.1487844 | 0.0702618 | 0.0220784 | 0.1593192 | 0.1185638 | 0.0885281 | 0.0858225 | 0.1207546 | -0.0667262 | 0.0124799 | NA | NA | NA | NA | NA | NA | NA | NA |
| 1579505_Pirellulaceae; Rubripirellula | 0.1082494 | 0.1075437 | 0.1553076 | 0.1164432 | 0.0871634 | -0.0041300 | 0.1520074 | 0.1226735 | 0.0526521 | -0.0066073 | 0.0718411 | 0.0657489 | 0.0470214 | 0.0260886 | -0.0036155 | -0.1337171 | -0.1030801 | NA | NA | NA | NA | NA | NA | NA | NA |
| 299261_Roseobacteraceae; Tateyamaria | -0.0738414 | -0.0591069 | -0.0255825 | -0.0193580 | -0.0171259 | -0.1178348 | 0.0757594 | 0.1274685 | 0.0640748 | 0.0371472 | 0.1802298 | 0.1250976 | 0.0949288 | 0.1063536 | 0.1792728 | 0.0006654 | 0.0939995 | NA | NA | NA | NA | NA | NA | NA | NA |
| 1759396_Rhodobacteraceae; Marivivens | -0.0400963 | -0.0286964 | 0.0052722 | 0.0038144 | 0.0007956 | -0.0927575 | 0.0839321 | 0.1195700 | 0.0586383 | 0.0279550 | 0.1525329 | 0.1084570 | 0.0818342 | 0.0873573 | 0.1395102 | -0.0212323 | 0.0566904 | NA | NA | NA | NA | NA | NA | NA | NA |
| 2433_Roseobacteraceae; Roseobacter | -0.0606787 | -0.0485497 | -0.0209034 | -0.0158184 | -0.0140104 | -0.0970085 | 0.0625203 | 0.1050588 | 0.0528010 | 0.0305738 | 0.1484418 | 0.1030496 | 0.0781950 | 0.0875791 | 0.1475779 | 0.0004128 | 0.0772785 | NA | NA | NA | NA | NA | NA | NA | NA |
| 1217416_Halieaceae; Halioglobus | -0.0642773 | -0.0480723 | -0.0032180 | -0.0026090 | -0.0048719 | -0.1311760 | 0.1084897 | 0.1610164 | 0.0794809 | 0.0400740 | 0.2112319 | 0.1491848 | 0.1127427 | 0.1220111 | 0.1979693 | -0.0208875 | 0.0874394 | NA | NA | NA | NA | NA | NA | NA | NA |
| 75787_Rhodocyclaceae; Unknown Genus | -0.1069751 | -0.0864884 | -0.0419069 | -0.0316661 | -0.0273628 | -0.1634345 | 0.0989342 | 0.1719347 | 0.0867973 | 0.0518478 | 0.2472803 | 0.1709841 | 0.1298673 | 0.1465907 | 0.2490550 | 0.0064234 | 0.1347504 | NA | NA | NA | NA | NA | NA | NA | NA |
| 475794_Halieaceae; Haliea | -0.1178646 | -0.0992655 | -0.0685726 | -0.0516347 | -0.0419481 | -0.1464399 | 0.0589840 | 0.1305795 | 0.0677607 | 0.0480272 | 0.2085534 | 0.1410170 | 0.1076851 | 0.1269065 | 0.2251232 | 0.0323174 | 0.1418653 | NA | NA | NA | NA | NA | NA | NA | NA |
| 518755_Verrucomicrobiaceae; Roseibacillus | -0.1836952 | -0.1803605 | -0.2515023 | -0.1885926 | -0.1415658 | -0.0110818 | -0.2310446 | -0.1765131 | -0.0743562 | 0.0161063 | -0.0875407 | -0.0860918 | -0.0607508 | -0.0256566 | 0.0326453 | 0.2133362 | 0.1784742 | NA | NA | NA | NA | NA | NA | NA | NA |
| 1892252_Microcoleaceae; Unknown Genus | -0.1275075 | -0.1239053 | -0.1673150 | -0.1254803 | -0.0944404 | -0.0185912 | -0.1441635 | -0.1034482 | -0.0425799 | 0.0141283 | -0.0400564 | -0.0444064 | -0.0306959 | -0.0065939 | 0.0386316 | 0.1399025 | 0.1260230 | NA | NA | NA | NA | NA | NA | NA | NA |
| 267893_Idiomarinaceae; Unknown Genus | -0.1899227 | -0.1796115 | -0.2213320 | -0.1660584 | -0.1259804 | -0.0695568 | -0.1524642 | -0.0808193 | -0.0287269 | 0.0323700 | 0.0198788 | -0.0071731 | -0.0016523 | 0.0332572 | 0.1188923 | 0.1769654 | 0.1959297 | NA | NA | NA | NA | NA | NA | NA | NA |
| 119045_Methylobacteriaceae; Unknown Genus | -0.2961772 | -0.2680004 | -0.2769560 | -0.2079765 | -0.1605335 | -0.2108717 | -0.0854583 | 0.0531729 | 0.0400664 | 0.0781823 | 0.2255584 | 0.1330528 | 0.1052149 | 0.1572324 | 0.3354683 | 0.1991200 | 0.3256465 | NA | NA | NA | NA | NA | NA | NA | NA |
| 150830_Stappiaceae; Roseibium | -0.3319441 | -0.2997781 | -0.3070950 | -0.2306201 | -0.1781779 | -0.2413018 | -0.0883941 | 0.0682830 | 0.0490196 | 0.0889669 | 0.2622304 | 0.1561138 | 0.1231471 | 0.1813289 | 0.3832556 | 0.2194400 | 0.3659467 | NA | NA | NA | NA | NA | NA | NA | NA |
| 478070_Stappiaceae; Labrenzia | -0.3631634 | -0.3298811 | -0.3467399 | -0.2603556 | -0.2006051 | -0.2478369 | -0.1207417 | 0.0464252 | 0.0402378 | 0.0929626 | 0.2561909 | 0.1480347 | 0.1177190 | 0.1817551 | 0.3956205 | 0.2522057 | 0.3971918 | NA | NA | NA | NA | NA | NA | NA | NA |
# I want to save this as a csv:
write.csv(MINT_sPLS_mat.corr_and_LOGOCV, file = "/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Paper_drafts/the_analysis_is_finalised/Code/output_tables/MINT_sPLS_sim_and LOGOCV_stab_scores_taxa.csv", quote = F, row.names = F)
# First extracting similarity values from the MINT sPLS heatmap
MINT_sPLS_mat.corr.GOs <- left_join(cim_mint.spls2.WQ.GOs[["mat"]] %>%
as.data.frame() %>%
rownames_to_column("OTU"),
megan_go_clr_5@tax_table %>% # Adding taxonomy info too
as.data.frame() %>%
rownames_to_column("OTU")
)
# Now adding the stability scores!
MINT_sPLS_mat.corr_and_LOGOCV_GOs <- left_join(MINT_sPLS_mat.corr.GOs, RawGraphs_shared_GOs_MINT[,1:2])
# Visualising as a table
knitr::kable(MINT_sPLS_mat.corr_and_LOGOCV_GOs, caption = "MINT sPLS - numerical representation of similarity scores (partial correlations).")
| OTU | median_PN_µM | median_Chlorophyll_A_µg_L | median_POC_µM | SALINITY_2.5m_RV | median_PP_µM | median_Phaeophytin_A_µg_L | median_TSS_mg_L | median_TDN_µM | median_DOC_µM | SEAWATER_TEMPERATURE_2.5m_RV | FLUORESCENCE_2.5m_RV | median_Si_µM | median_NH4_µM | median_NO2_µM | median_NO3_µM | median_PO4_µM | median_TDP_µM | Rank1 | Rank2 | Rank3 | Rank4 | Rank5 | Rank6 | Freq |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 4131_IPR004131 Pyrophosphate-energised proton pump | 0.1679383 | 0.1696877 | 0.1338804 | 0.3330083 | 0.2635126 | 0.2460769 | 0.1902344 | 0.0070085 | -0.2092062 | 0.0599995 | 0.0114677 | -0.0781077 | 0.2051473 | 0.2799049 | 0.1734274 | 0.0265416 | -0.1786604 | NA | NA | NA | NA | NA | NA | NA |
| 11400_IPR011400 Eukaryotic translation initiation factor 3 subunit B | 0.1662056 | 0.1679843 | 0.1318197 | 0.3311321 | 0.2620205 | 0.2452760 | 0.1901574 | 0.0059766 | -0.2105404 | 0.0599004 | 0.0114952 | -0.0781252 | 0.2058323 | 0.2810541 | 0.1746662 | 0.0280886 | -0.1772776 | NA | NA | NA | NA | NA | NA | NA |
| 7276_IPR007276 Nucleolar protein 14 | 0.1655500 | 0.1673781 | 0.1304902 | 0.3316845 | 0.2624486 | 0.2463796 | 0.1916546 | 0.0048095 | -0.2138720 | 0.0602837 | 0.0116235 | -0.0787983 | 0.2083589 | 0.2847564 | 0.1775857 | 0.0301476 | -0.1771271 | NA | NA | NA | NA | NA | NA | NA |
| 20164_IPR020164 Cytochrome c oxidase assembly protein COX16 | 0.1726935 | 0.1743167 | 0.1401913 | 0.3366515 | 0.2664237 | 0.2465971 | 0.1886251 | 0.0107667 | -0.2021732 | 0.0597697 | 0.0112517 | -0.0772645 | 0.2005630 | 0.2728537 | 0.1671045 | 0.0205383 | -0.1820106 | NA | NA | NA | NA | NA | NA | NA |
| 17423_IPR017423 tRNA (adenine(58)-N(1))-methyltransferase non-catalytic subunit TRM6 | 0.1888808 | 0.1904263 | 0.1566282 | 0.3606394 | 0.2854451 | 0.2612787 | 0.1971555 | 0.0164320 | -0.2041769 | 0.0628495 | 0.0115992 | -0.0805114 | 0.2057691 | 0.2788406 | 0.1680750 | 0.0136286 | -0.1968364 | NA | NA | NA | NA | NA | NA | NA |
| 27108_IPR027108 Pre-mRNA-processing factor 6/Prp1/STA1 | 0.1869891 | 0.1885140 | 0.1551326 | 0.3568597 | 0.2824543 | 0.2584749 | 0.1949780 | 0.0163707 | -0.2017563 | 0.0621641 | 0.0114673 | -0.0796164 | 0.2034068 | 0.2756136 | 0.1660658 | 0.0132962 | -0.1948155 | NA | NA | NA | NA | NA | NA | NA |
| 560_NA | 0.1725739 | 0.1743615 | 0.1377201 | 0.3418688 | 0.2705257 | 0.2525000 | 0.1950847 | 0.0074058 | -0.2142388 | 0.0615452 | 0.0117533 | -0.0800887 | 0.2102147 | 0.2867733 | 0.1775711 | 0.0268874 | -0.1834940 | NA | NA | NA | NA | NA | NA | NA |
| 2671_GO:0006412 translation | 0.1685998 | 0.1704453 | 0.1331265 | 0.3372611 | 0.2668637 | 0.2503237 | 0.1945400 | 0.0052264 | -0.2166168 | 0.0612163 | 0.0117878 | -0.0799682 | 0.2112386 | 0.2886208 | 0.1798211 | 0.0300799 | -0.1802326 | NA | NA | NA | NA | NA | NA | NA |
| 30661_IPR030661 SUMO-activating enzyme subunit Uba2 | 0.1680451 | 0.1699363 | 0.1319479 | 0.3378523 | 0.2673232 | 0.2513953 | 0.1959562 | 0.0041629 | -0.2197125 | 0.0615818 | 0.0119079 | -0.0806030 | 0.2135985 | 0.2920733 | 0.1825313 | 0.0319664 | -0.1801419 | NA | NA | NA | NA | NA | NA | NA |
| 31120_IPR031120 WD repeat HIR1 | 0.1468733 | 0.1488188 | 0.1111284 | 0.3049193 | 0.2412190 | 0.2304562 | 0.1829147 | -0.0022880 | -0.2136018 | 0.0570342 | 0.0113078 | -0.0755335 | 0.2039899 | 0.2802088 | 0.1782362 | 0.0391844 | -0.1602905 | NA | NA | NA | NA | NA | NA | NA |
| 8576_GO:0006807 nitrogen compound metabolic process | 0.1432243 | 0.1451537 | 0.1079036 | 0.2984084 | 0.2360634 | 0.2259171 | 0.1796577 | -0.0028863 | -0.2106794 | 0.0559721 | 0.0111264 | -0.0742191 | 0.2008345 | 0.2760033 | 0.1758753 | 0.0394540 | -0.1566226 | NA | NA | NA | NA | NA | NA | NA |
| 7720_GO:0009058 biosynthetic process | 0.1575777 | 0.1593335 | 0.1239804 | 0.3162304 | 0.2502178 | 0.2350927 | 0.1830522 | 0.0042588 | -0.2047339 | 0.0575535 | 0.0111122 | -0.0752775 | 0.1992565 | 0.2723858 | 0.1700406 | 0.0293012 | -0.1687505 | NA | NA | NA | NA | NA | NA | NA |
| 7704_GO:0009058 biosynthetic process | 0.1591551 | 0.1607514 | 0.1277621 | 0.3135629 | 0.2481352 | 0.2309461 | 0.1778318 | 0.0078903 | -0.1937211 | 0.0561852 | 0.0106783 | -0.0729514 | 0.1907735 | 0.2600139 | 0.1604178 | 0.0227847 | -0.1687172 | NA | NA | NA | NA | NA | NA | NA |
| 4299_NA | 0.1070626 | 0.1096284 | 0.0645461 | 0.2600611 | 0.2055564 | 0.2101022 | 0.1790285 | -0.0249190 | -0.2403320 | 0.0541723 | 0.0117744 | -0.0750123 | 0.2165804 | 0.3020784 | 0.2032972 | 0.0726825 | -0.1280028 | NA | NA | NA | NA | NA | NA | NA |
| 2312_GO:0044281 small molecule metabolic process | 0.1053821 | 0.1080028 | 0.0621680 | 0.2591129 | 0.2047945 | 0.2102966 | 0.1800076 | -0.0264560 | -0.2435776 | 0.0543667 | 0.0118825 | -0.0754894 | 0.2188101 | 0.3054488 | 0.2061907 | 0.0752003 | -0.1269191 | NA | NA | NA | NA | NA | NA | NA |
| 26872_IPR026872 Protein farnesyltransferase subunit beta | 0.1135313 | 0.1160776 | 0.0709489 | 0.2700273 | 0.2134567 | 0.2163930 | 0.1828972 | -0.0228891 | -0.2419843 | 0.0555298 | 0.0119488 | -0.0765105 | 0.2193439 | 0.3054552 | 0.2044233 | 0.0703652 | -0.1340398 | NA | NA | NA | NA | NA | NA | NA |
| 8733_IPR008733 Peroxisomal biogenesis factor 11 | 0.1209279 | 0.1233341 | 0.0799601 | 0.2775433 | 0.2194390 | 0.2192627 | 0.1826297 | -0.0181808 | -0.2351854 | 0.0557887 | 0.0117856 | -0.0761753 | 0.2155346 | 0.2992736 | 0.1981782 | 0.0631861 | -0.1397969 | NA | NA | NA | NA | NA | NA | NA |
| 6084_NA | 0.1173802 | 0.1199706 | 0.0739605 | 0.2777900 | 0.2195987 | 0.2221782 | 0.1874149 | -0.0228087 | -0.2470713 | 0.0569484 | 0.0122238 | -0.0783696 | 0.2242801 | 0.3122081 | 0.2086515 | 0.0711259 | -0.1381730 | NA | NA | NA | NA | NA | NA | NA |
| 22878_IPR022878 V-type ATP synthase catalytic alpha chain | 0.1168367 | 0.1194122 | 0.0736599 | 0.2764076 | 0.2185063 | 0.2210423 | 0.1864309 | -0.0226440 | -0.2457120 | 0.0566527 | 0.0121582 | -0.0779559 | 0.2230689 | 0.3105136 | 0.2074987 | 0.0706843 | -0.1375048 | NA | NA | NA | NA | NA | NA | NA |
| 5946_GO:0009058 biosynthetic process | 0.1229914 | 0.1254597 | 0.0810219 | 0.2829738 | 0.2237297 | 0.2237781 | 0.1865859 | -0.0189184 | -0.2407540 | 0.0569722 | 0.0120516 | -0.0778419 | 0.2204601 | 0.3061783 | 0.2029085 | 0.0650752 | -0.1423875 | NA | NA | NA | NA | NA | NA | NA |
| 1948_GO:0046872 metal ion binding | 0.1411160 | 0.1432814 | 0.1025232 | 0.3027221 | 0.2394355 | 0.2322933 | 0.1875398 | -0.0082003 | -0.2270744 | 0.0580504 | 0.0117762 | -0.0777232 | 0.2135168 | 0.2944762 | 0.1901897 | 0.0490372 | -0.1568880 | NA | NA | NA | NA | NA | NA | NA |
| 3197_GO:0006091 generation of precursor metabolites and energy | 0.1392345 | 0.1414313 | 0.1002926 | 0.3006687 | 0.2378024 | 0.2314056 | 0.1874367 | -0.0093109 | -0.2284870 | 0.0579373 | 0.0118045 | -0.0777337 | 0.2142316 | 0.2956823 | 0.1915036 | 0.0506982 | -0.1553817 | NA | NA | NA | NA | NA | NA | NA |
| 17132_IPR017132 Sm-like protein Lsm7 | 0.1440792 | 0.1462216 | 0.1056588 | 0.3068224 | 0.2426887 | 0.2346566 | 0.1887496 | -0.0069843 | -0.2267897 | 0.0585172 | 0.0118126 | -0.0781639 | 0.2139472 | 0.2948191 | 0.1898025 | 0.0474327 | -0.1595161 | NA | NA | NA | NA | NA | NA | NA |
| 27097_IPR027097 Mitotic spindle checkpoint protein Mad2 | 0.1481427 | 0.1501719 | 0.1111289 | 0.3097586 | 0.2450371 | 0.2349039 | 0.1871604 | -0.0036637 | -0.2203833 | 0.0582618 | 0.0116115 | -0.0773499 | 0.2097118 | 0.2883354 | 0.1840556 | 0.0420960 | -0.1623267 | NA | NA | NA | NA | NA | NA | NA |
| 8384_GO:0065003 protein-containing complex assembly | 0.1199790 | 0.1221998 | 0.0817208 | 0.2698826 | 0.2134051 | 0.2114324 | 0.1745672 | -0.0146648 | -0.2210617 | 0.0535233 | 0.0111808 | -0.0726828 | 0.2039947 | 0.2827329 | 0.1859779 | 0.0562893 | -0.1370809 | NA | NA | NA | NA | NA | NA | NA |
| 4567_GO:0009058 biosynthetic process | 0.0914410 | 0.0938162 | 0.0524922 | 0.2281671 | 0.1803229 | 0.1861898 | 0.1602235 | -0.0250065 | -0.2188171 | 0.0482853 | 0.0106219 | -0.0672623 | 0.1958495 | 0.2736667 | 0.1853836 | 0.0691425 | -0.1111129 | NA | NA | NA | NA | NA | NA | NA |
| 362_NA | 0.2046618 | 0.2056808 | 0.1791181 | 0.3691809 | 0.2923147 | 0.2590503 | 0.1875235 | 0.0310881 | -0.1728888 | 0.0609037 | 0.0105507 | -0.0758395 | 0.1841803 | 0.2462534 | 0.1402005 | -0.0104376 | -0.2069066 | NA | NA | NA | NA | NA | NA | NA |
| 25655_IPR025655 Peroxisomal membrane protein 14 | 0.2002543 | 0.2013311 | 0.1741162 | 0.3638582 | 0.2880862 | 0.2563997 | 0.1866623 | 0.0288018 | -0.1750501 | 0.0604681 | 0.0105691 | -0.0755936 | 0.1849340 | 0.2477521 | 0.1422831 | -0.0071451 | -0.2032267 | NA | NA | NA | NA | NA | NA | NA |
| 7315_GO:0009058 biosynthetic process | 0.2075643 | 0.2084592 | 0.1836455 | 0.3698545 | 0.2928725 | 0.2576404 | 0.1846669 | 0.0343358 | -0.1651247 | 0.0602467 | 0.0102740 | -0.0745064 | 0.1785977 | 0.2379370 | 0.1333315 | -0.0159116 | -0.2084938 | NA | NA | NA | NA | NA | NA | NA |
| 30468_IPR030468 NEDD8-activating enzyme E1 catalytic subunit | 0.1904936 | 0.1917326 | 0.1625507 | 0.3531918 | 0.2796035 | 0.2517792 | 0.1861108 | 0.0230490 | -0.1823469 | 0.0598771 | 0.0107145 | -0.0756410 | 0.1886171 | 0.2539731 | 0.1490719 | 0.0014554 | -0.1954085 | NA | NA | NA | NA | NA | NA | NA |
| 6846_GO:0006412 translation | 0.1845944 | 0.1858245 | 0.1570943 | 0.3432243 | 0.2717077 | 0.2450631 | 0.1815203 | 0.0217384 | -0.1788724 | 0.0583462 | 0.0104733 | -0.0738107 | 0.1845185 | 0.2486191 | 0.1463389 | 0.0025429 | -0.1896436 | NA | NA | NA | NA | NA | NA | NA |
| 16656_GO:0009058 biosynthetic process | 0.2019918 | 0.2031821 | 0.1741336 | 0.3704437 | 0.2932821 | 0.2624448 | 0.1924273 | 0.0269424 | -0.1842464 | 0.0621356 | 0.0109812 | -0.0780596 | 0.1926969 | 0.2587761 | 0.1501740 | -0.0032045 | -0.2060025 | NA | NA | NA | NA | NA | NA | NA |
| 2738_IPR002738 RNase P subunit p30 | 0.1995368 | 0.2007823 | 0.1710191 | 0.3682330 | 0.2915198 | 0.2618088 | 0.1928594 | 0.0252049 | -0.1871389 | 0.0621443 | 0.0110619 | -0.0783208 | 0.1944714 | 0.2615629 | 0.1527982 | -0.0004902 | -0.2041755 | NA | NA | NA | NA | NA | NA | NA |
| 11603_IPR011603 2-oxoglutarate dehydrogenase E1 component | 0.2081109 | 0.2092116 | 0.1812117 | 0.3775266 | 0.2989114 | 0.2657828 | 0.1932519 | 0.0303052 | -0.1805588 | 0.0626381 | 0.0109270 | -0.0782389 | 0.1910994 | 0.2559012 | 0.1466870 | -0.0081339 | -0.2110208 | NA | NA | NA | NA | NA | NA | NA |
| 6886_GO:0006351 transcription, DNA-templated | 0.2451175 | 0.2454221 | 0.2276594 | 0.4120010 | 0.3263807 | 0.2766529 | 0.1881296 | 0.0557869 | -0.1395339 | 0.0628903 | 0.0098182 | -0.0749094 | 0.1664193 | 0.2168725 | 0.1093653 | -0.0477064 | -0.2389009 | NA | NA | NA | NA | NA | NA | NA |
| 12762_GO:0009058 biosynthetic process | 0.2403300 | 0.2408130 | 0.2205694 | 0.4100234 | 0.3247792 | 0.2780128 | 0.1917936 | 0.0509631 | -0.1503999 | 0.0636851 | 0.0101936 | -0.0766507 | 0.1740709 | 0.2283469 | 0.1190130 | -0.0396890 | -0.2360271 | NA | NA | NA | NA | NA | NA | NA |
| 511_IPR000511 Cytochrome c/c1 haem-lyase | 0.2293119 | 0.2300678 | 0.2062249 | 0.4009425 | 0.3175313 | 0.2760950 | 0.1947495 | 0.0426483 | -0.1652643 | 0.0640045 | 0.0106343 | -0.0782668 | 0.1835446 | 0.2430301 | 0.1324224 | -0.0265255 | -0.2280757 | NA | NA | NA | NA | NA | NA | NA |
| 2755_GO:0009058 biosynthetic process | 0.2044722 | 0.2052060 | 0.1830277 | 0.3594818 | 0.2846851 | 0.2483834 | 0.1760362 | 0.0368162 | -0.1517751 | 0.0577282 | 0.0096665 | -0.0708290 | 0.1672021 | 0.2218050 | 0.1219038 | -0.0213517 | -0.2039518 | NA | NA | NA | NA | NA | NA | NA |
| 9244_IPR009244 Mediator complex, subunit Med7 | 0.2449597 | 0.2450257 | 0.2309328 | 0.4038840 | 0.3199956 | 0.2677254 | 0.1785149 | 0.0605818 | -0.1218613 | 0.0602325 | 0.0090781 | -0.0707158 | 0.1522085 | 0.1964091 | 0.0940507 | -0.0568421 | -0.2364285 | NA | NA | NA | NA | NA | NA | NA |
| 29751_GO:0006412 translation | 0.2473117 | 0.2472180 | 0.2354499 | 0.4024816 | 0.3189154 | 0.2644120 | 0.1738446 | 0.0644121 | -0.1112071 | 0.0590506 | 0.0086718 | -0.0686070 | 0.1441853 | 0.1846272 | 0.0847020 | -0.0635523 | -0.2371394 | NA | NA | NA | NA | NA | NA | NA |
| 2994_IPR002994 Surfeit locus 1/Shy1 | 0.2594934 | 0.2594860 | 0.2457420 | 0.4253033 | 0.3369809 | 0.2807755 | 0.1860310 | 0.0657410 | -0.1233957 | 0.0629582 | 0.0093790 | -0.0735685 | 0.1566704 | 0.2014784 | 0.0946925 | -0.0631840 | -0.2497049 | NA | NA | NA | NA | NA | NA | NA |
| 10971_GO:0009058 biosynthetic process | 0.1758571 | 0.1767942 | 0.1530261 | 0.3192473 | 0.2527668 | 0.2248484 | 0.1635805 | 0.0254661 | -0.1530929 | 0.0530073 | 0.0092551 | -0.0662352 | 0.1618974 | 0.2168392 | 0.1244018 | -0.0066033 | -0.1783842 | NA | NA | NA | NA | NA | NA | NA |
| 5279_IPR000109 Proton-dependent oligopeptide transporter family | 0.1722869 | 0.1730662 | 0.1519089 | 0.3081983 | 0.2440430 | 0.2151937 | 0.1547368 | 0.0277594 | -0.1397562 | 0.0504086 | 0.0086404 | -0.0624790 | 0.1504059 | 0.2006137 | 0.1130080 | -0.0118018 | -0.1734139 | NA | NA | NA | NA | NA | NA | NA |
| 16461_GO:0016740 transferase activity | 0.1412733 | 0.1423577 | 0.1181772 | 0.2673818 | 0.2116438 | 0.1927954 | 0.1446109 | 0.0137413 | -0.1474329 | 0.0462221 | 0.0084552 | -0.0589733 | 0.1496687 | 0.2024543 | 0.1211329 | 0.0074403 | -0.1465273 | NA | NA | NA | NA | NA | NA | NA |
| 13025_GO:0006412 translation | 0.3561798 | 0.3509502 | 0.4121626 | 0.4119027 | 0.3273750 | 0.1938749 | 0.0475415 | 0.1959769 | 0.2154956 | 0.0291256 | -0.0031875 | -0.0102409 | -0.0936700 | -0.1683203 | -0.2037072 | -0.2873700 | -0.2919920 | NA | NA | NA | NA | NA | NA | NA |
| 2141_IPR002141 Influenza virus nucleoprotein (NP) | 0.3365515 | 0.3315044 | 0.3909663 | 0.3857206 | 0.3065949 | 0.1793097 | 0.0407103 | 0.1873199 | 0.2114196 | 0.0263595 | -0.0033373 | -0.0078377 | -0.0947644 | -0.1680600 | -0.1992436 | -0.2755999 | -0.2748725 | NA | NA | NA | NA | NA | NA | NA |
| 27185_IPR017241 Toll-like receptor | 0.3583054 | 0.3528018 | 0.4181063 | 0.4063620 | 0.3230382 | 0.1861186 | 0.0381538 | 0.2020677 | 0.2346937 | 0.0266331 | -0.0039540 | -0.0060790 | -0.1085969 | -0.1900294 | -0.2204528 | -0.2984232 | -0.2913726 | NA | NA | NA | NA | NA | NA | NA |
| 5133_IPR005133 Na+/H+ antiporter subunit G | 0.3909856 | 0.3851747 | 0.4534489 | 0.4498353 | 0.3575428 | 0.2102368 | 0.0493840 | 0.2165541 | 0.2417455 | 0.0311989 | -0.0037156 | -0.0100176 | -0.1069879 | -0.1907697 | -0.2281147 | -0.3181584 | -0.3198408 | NA | NA | NA | NA | NA | NA | NA |
| 2132_GO:0006412 translation | 0.3825095 | 0.3768380 | 0.4434245 | 0.4405289 | 0.3501420 | 0.2061756 | 0.0488522 | 0.2115853 | 0.2355069 | 0.0306711 | -0.0035934 | -0.0100357 | -0.1038681 | -0.1854806 | -0.2223043 | -0.3107409 | -0.3130386 | NA | NA | NA | NA | NA | NA | NA |
| 4769_IPR004769 Adenylosuccinate lyase | 0.3378194 | 0.3321185 | 0.4015439 | 0.3662701 | 0.2913105 | 0.1566916 | 0.0155891 | 0.2008866 | 0.2590264 | 0.0194908 | -0.0053033 | 0.0031691 | -0.1326695 | -0.2228012 | -0.2405789 | -0.3010419 | -0.2697352 | NA | NA | NA | NA | NA | NA | NA |
| 22270_GO:0016491 oxidoreductase activity | 0.3297492 | 0.3242848 | 0.3905109 | 0.3608274 | 0.2869522 | 0.1566337 | 0.0192154 | 0.1940528 | 0.2454326 | 0.0201276 | -0.0048676 | 0.0013473 | -0.1235596 | -0.2089182 | -0.2284107 | -0.2899893 | -0.2642680 | NA | NA | NA | NA | NA | NA | NA |
| 10226_IPR010226 NADH-quinone oxidoreductase, chain I | 0.3577567 | 0.3518360 | 0.4235679 | 0.3917303 | 0.3115258 | 0.1702224 | 0.0211566 | 0.2103776 | 0.2657061 | 0.0219224 | -0.0052571 | 0.0013267 | -0.1335950 | -0.2260010 | -0.2473146 | -0.3143213 | -0.2867894 | NA | NA | NA | NA | NA | NA | NA |
| 5150_GO:0009058 biosynthetic process | 0.4374276 | 0.4312770 | 0.5022819 | 0.5148125 | 0.4090918 | 0.2480740 | 0.0692089 | 0.2351738 | 0.2446074 | 0.0387532 | -0.0030781 | -0.0173028 | -0.0989586 | -0.1835461 | -0.2327959 | -0.3424718 | -0.3612413 | NA | NA | NA | NA | NA | NA | NA |
| 1036_IPR001036 Acriflavin resistance protein | 0.3982781 | 0.3914287 | 0.4752459 | 0.4275992 | 0.3401257 | 0.1800303 | 0.0132750 | 0.2394360 | 0.3148369 | 0.0215718 | -0.0066469 | 0.0059650 | -0.1639957 | -0.2736023 | -0.2918306 | -0.3598468 | -0.3167625 | NA | NA | NA | NA | NA | NA | NA |
| 14358_IPR014358 Enoyl-[acyl-carrier-protein] reductase (NADH) | 0.3871416 | 0.3805612 | 0.4608452 | 0.4181960 | 0.3326232 | 0.1778415 | 0.0159908 | 0.2311701 | 0.3003161 | 0.0218197 | -0.0062225 | 0.0044502 | -0.1548239 | -0.2593431 | -0.2787135 | -0.3468042 | -0.3086592 | NA | NA | NA | NA | NA | NA | NA |
| 1591_IPR001591 Influenza RNA-dependent RNA polymerase subunit PB2 | 0.4072700 | 0.4008344 | 0.4778228 | 0.4559710 | 0.3625253 | 0.2049528 | 0.0362065 | 0.2333255 | 0.2800291 | 0.0282983 | -0.0050478 | -0.0037827 | -0.1340762 | -0.2313289 | -0.2620781 | -0.3461190 | -0.3294414 | NA | NA | NA | NA | NA | NA | NA |
| 20948_NA | 0.2950401 | 0.2887744 | 0.3691477 | 0.2775219 | 0.2211022 | 0.0896395 | -0.0376096 | 0.2015132 | 0.3210966 | 0.0029002 | -0.0085909 | 0.0251353 | -0.1919687 | -0.3042492 | -0.2923675 | -0.3123794 | -0.2230670 | NA | NA | NA | NA | NA | NA | NA |
| 5670_IPR005670 Phosphate transport system permease protein 1 | 0.2865337 | 0.2803796 | 0.3594960 | 0.2672447 | 0.2129377 | 0.0845190 | -0.0392771 | 0.1971035 | 0.3169354 | 0.0020579 | -0.0085559 | 0.0256122 | -0.1905221 | -0.3013683 | -0.2883569 | -0.3060300 | -0.2159636 | NA | NA | NA | NA | NA | NA | NA |
| 453_GO:0009058 biosynthetic process | 0.3055940 | 0.2992792 | 0.3798436 | 0.2932097 | 0.2335416 | 0.0992650 | -0.0319901 | 0.2051774 | 0.3196831 | 0.0049241 | -0.0083599 | 0.0229932 | -0.1884886 | -0.3002220 | -0.2916420 | -0.3168287 | -0.2327474 | NA | NA | NA | NA | NA | NA | NA |
| 2381_IPR002381 Ribonuclease PH, bacterial-type | 0.2863774 | 0.2803739 | 0.3571878 | 0.2719482 | 0.2166352 | 0.0898765 | -0.0333924 | 0.1940125 | 0.3059035 | 0.0036732 | -0.0080980 | 0.0230380 | -0.1817077 | -0.2886520 | -0.2787847 | -0.3002019 | -0.2172778 | NA | NA | NA | NA | NA | NA | NA |
| 11864_IPR011864 Phosphate ABC transporter, permease protein PstC | 0.3299073 | 0.3232990 | 0.4070676 | 0.3234176 | 0.2575331 | 0.1148292 | -0.0262166 | 0.2172686 | 0.3297108 | 0.0076093 | -0.0083820 | 0.0211902 | -0.1911267 | -0.3062995 | -0.3014877 | -0.3340039 | -0.2532966 | NA | NA | NA | NA | NA | NA | NA |
| 1062_IPR001062 Transcription antitermination protein, NusG | 0.3125751 | 0.3060040 | 0.3901268 | 0.2962206 | 0.2359769 | 0.0974239 | -0.0371791 | 0.2121331 | 0.3352430 | 0.0038075 | -0.0088954 | 0.0254651 | -0.1994176 | -0.3166246 | -0.3054631 | -0.3283710 | -0.2369755 | NA | NA | NA | NA | NA | NA | NA |
| 3764_IPR003764 N-acetylglucosamine-6-phosphate deacetylase | 0.2661621 | 0.2602801 | 0.3363098 | 0.2427970 | 0.1935144 | 0.0724396 | -0.0430713 | 0.1864418 | 0.3066015 | 0.0000957 | -0.0084567 | 0.0266674 | -0.1867618 | -0.2940429 | -0.2784323 | -0.2906321 | -0.1990007 | NA | NA | NA | NA | NA | NA | NA |
| 6130_GO:0006807 nitrogen compound metabolic process | 0.2632171 | 0.2574362 | 0.3320720 | 0.2412965 | 0.1923059 | 0.0729596 | -0.0411607 | 0.1836491 | 0.3005530 | 0.0004900 | -0.0082523 | 0.0257461 | -0.1825649 | -0.2877194 | -0.2730488 | -0.2860317 | -0.1971490 | NA | NA | NA | NA | NA | NA | NA |
| 5982_IPR005982 Thioredoxin reductase | 0.2391295 | 0.2334546 | 0.3077498 | 0.2052868 | 0.1637539 | 0.0507625 | -0.0542344 | 0.1754122 | 0.3042384 | -0.0041977 | -0.0087987 | 0.0307435 | -0.1908763 | -0.2974417 | -0.2751030 | -0.2761165 | -0.1749945 | NA | NA | NA | NA | NA | NA | NA |
| 7016_NA | 0.2539173 | 0.2477384 | 0.3289766 | 0.2129411 | 0.1699163 | 0.0482847 | -0.0636829 | 0.1893609 | 0.3343402 | -0.0061375 | -0.0098138 | 0.0353060 | -0.2117343 | -0.3288829 | -0.3019018 | -0.2990761 | -0.1843277 | NA | NA | NA | NA | NA | NA | NA |
| 5338_GO:0006807 nitrogen compound metabolic process | 0.2474970 | 0.2415339 | 0.3198046 | 0.2095174 | 0.1671618 | 0.0492484 | -0.0597023 | 0.1833668 | 0.3214963 | -0.0053288 | -0.0093825 | 0.0333782 | -0.2028591 | -0.3154926 | -0.2904620 | -0.2892253 | -0.1802459 | NA | NA | NA | NA | NA | NA | NA |
| 114_GO:0006412 translation | 0.3138185 | 0.3077303 | 0.3843772 | 0.3141625 | 0.2500992 | 0.1164916 | -0.0170581 | 0.2026632 | 0.2990371 | 0.0094107 | -0.0073642 | 0.0167159 | -0.1700993 | -0.2744923 | -0.2741314 | -0.3101067 | -0.2428684 | NA | NA | NA | NA | NA | NA | NA |
| 394_IPR000394 RNA polymerase sigma factor 54 | 0.3027741 | 0.2968363 | 0.3717663 | 0.3010014 | 0.2396421 | 0.1100466 | -0.0190025 | 0.1968256 | 0.2932259 | 0.0083779 | -0.0073017 | 0.0172388 | -0.1678933 | -0.2702796 | -0.2685699 | -0.3016500 | -0.2336995 | NA | NA | NA | NA | NA | NA | NA |
| 8141_GO:0006807 nitrogen compound metabolic process | 0.3278833 | 0.3216942 | 0.3991402 | 0.3339003 | 0.2657576 | 0.1280169 | -0.0109821 | 0.2082655 | 0.2997705 | 0.0117183 | -0.0071656 | 0.0144781 | -0.1675606 | -0.2721505 | -0.2754335 | -0.3174005 | -0.2554241 | NA | NA | NA | NA | NA | NA | NA |
| 4846_IPR004846 Type II/III secretion system | 0.3075576 | 0.3018869 | 0.3724644 | 0.3176391 | 0.2527729 | 0.1250259 | -0.0049360 | 0.1926248 | 0.2712504 | 0.0124711 | -0.0063067 | 0.0112378 | -0.1492026 | -0.2437934 | -0.2497437 | -0.2925441 | -0.2409006 | NA | NA | NA | NA | NA | NA | NA |
| 529_GO:0006412 translation | 0.2817798 | 0.2761459 | 0.3475339 | 0.2765810 | 0.2202343 | 0.0984612 | -0.0219760 | 0.1853613 | 0.2808412 | 0.0066140 | -0.0071271 | 0.0179172 | -0.1626265 | -0.2607250 | -0.2568378 | -0.2848769 | -0.2164468 | NA | NA | NA | NA | NA | NA | NA |
| 17714_GO:0009058 biosynthetic process | 0.2943836 | 0.2894995 | 0.3487120 | 0.3219373 | 0.2560261 | 0.1396215 | 0.0169232 | 0.1733584 | 0.2195385 | 0.0179051 | -0.0043634 | 0.0013038 | -0.1106516 | -0.1870071 | -0.2042852 | -0.2591113 | -0.2358688 | NA | NA | NA | NA | NA | NA | NA |
| 4903_IPR004903 Lactobacillus surface layer protein | 0.2586764 | 0.2539664 | 0.3124152 | 0.2691121 | 0.2141375 | 0.1073353 | -0.0017859 | 0.1608065 | 0.2237584 | 0.0111412 | -0.0051215 | 0.0084188 | -0.1219749 | -0.1999822 | -0.2062525 | -0.2437649 | -0.2031912 | NA | NA | NA | NA | NA | NA | NA |
| 7812_IPR007812 Type II secretion system protein GspL | 0.2060827 | 0.2023567 | 0.2485166 | 0.2152664 | 0.1712834 | 0.0864812 | -0.0003712 | 0.1275765 | 0.1763166 | 0.0091659 | -0.0039989 | 0.0062479 | -0.0956130 | -0.1570709 | -0.1626290 | -0.1931877 | -0.1621355 | NA | NA | NA | NA | NA | NA | NA |
| 31723_GO:0016829 lyase activity | 0.3181549 | 0.3133740 | 0.3697332 | 0.3643196 | 0.2895867 | 0.1691553 | 0.0381020 | 0.1772755 | 0.2005723 | 0.0248131 | -0.0031845 | -0.0072421 | -0.0901533 | -0.1596934 | -0.1889675 | -0.2609048 | -0.2597538 | NA | NA | NA | NA | NA | NA | NA |
| 1971_GO:0006412 translation | 0.3132595 | 0.3085376 | 0.3642541 | 0.3582318 | 0.2847518 | 0.1660154 | 0.0369329 | 0.1748444 | 0.1985655 | 0.0242706 | -0.0031805 | -0.0068761 | -0.0896320 | -0.1584839 | -0.1869957 | -0.2574530 | -0.2556147 | NA | NA | NA | NA | NA | NA | NA |
| 456_GO:0006412 translation | 0.3024839 | 0.2978715 | 0.3524818 | 0.3441702 | 0.2735892 | 0.1583669 | 0.0335599 | 0.1698999 | 0.1956294 | 0.0228561 | -0.0032336 | -0.0057215 | -0.0896725 | -0.1575337 | -0.1839397 | -0.2506272 | -0.2463084 | NA | NA | NA | NA | NA | NA | NA |
| 4923_IPR004923 Iron permease FTR1/Fip1/EfeU | 0.3065901 | 0.3021671 | 0.3536542 | 0.3571366 | 0.2838261 | 0.1697592 | 0.0440439 | 0.1671033 | 0.1797117 | 0.0259311 | -0.0025024 | -0.0101781 | -0.0759914 | -0.1382032 | -0.1703335 | -0.2443466 | -0.2521014 | NA | NA | NA | NA | NA | NA | NA |
| 4792_NA | 0.2858812 | 0.2818996 | 0.3277208 | 0.3377098 | 0.2683482 | 0.1635259 | 0.0467473 | 0.1529268 | 0.1570562 | 0.0257451 | -0.0018945 | -0.0119701 | -0.0624227 | -0.1167119 | -0.1497101 | -0.2223593 | -0.2364598 | NA | NA | NA | NA | NA | NA | NA |
| 17665_GO:0006807 nitrogen compound metabolic process | 0.3452918 | 0.3403813 | 0.3972790 | 0.4045557 | 0.3214920 | 0.1937924 | 0.0524292 | 0.1867594 | 0.1971640 | 0.0299835 | -0.0025999 | -0.0126967 | -0.0813864 | -0.1496000 | -0.1872980 | -0.2724629 | -0.2846148 | NA | NA | NA | NA | NA | NA | NA |
| 4607_GO:0009058 biosynthetic process | 0.3321917 | 0.3273820 | 0.3834353 | 0.3863864 | 0.3070765 | 0.1832967 | 0.0470294 | 0.1814095 | 0.1960007 | 0.0279056 | -0.0027649 | -0.0107257 | -0.0833656 | -0.1512259 | -0.1856688 | -0.2654191 | -0.2729838 | NA | NA | NA | NA | NA | NA | NA |
| 218_GO:0006412 translation | 0.3496593 | 0.3448321 | 0.4002195 | 0.4144588 | 0.3293224 | 0.2015767 | 0.0588791 | 0.1861772 | 0.1889406 | 0.0319581 | -0.0021856 | -0.0153841 | -0.0738191 | -0.1391041 | -0.1803751 | -0.2703219 | -0.2896281 | NA | NA | NA | NA | NA | NA | NA |
| 16932_IPR016932 Uncharacterised conserved protein UCP029669 | 0.3532248 | 0.3487606 | 0.3983893 | 0.4322571 | 0.3433541 | 0.2187559 | 0.0758893 | 0.1797255 | 0.1604750 | 0.0368081 | -0.0009396 | -0.0227065 | -0.0501932 | -0.1053921 | -0.1558642 | -0.2572340 | -0.2965893 | NA | NA | NA | NA | NA | NA | NA |
| 2975_IPR001019 Guanine nucleotide binding protein (G-protein), alpha subunit | 0.3453459 | 0.3408109 | 0.3919455 | 0.4170072 | 0.3312850 | 0.2076272 | 0.0674159 | 0.1791669 | 0.1694537 | 0.0341177 | -0.0014427 | -0.0192392 | -0.0591470 | -0.1175573 | -0.1632756 | -0.2580432 | -0.2883176 | NA | NA | NA | NA | NA | NA | NA |
| 4625_GO:0009058 biosynthetic process | 0.3301929 | 0.3260877 | 0.3714390 | 0.4063069 | 0.3227236 | 0.2069825 | 0.0736433 | 0.1666315 | 0.1450064 | 0.0351531 | -0.0006694 | -0.0224059 | -0.0429058 | -0.0927350 | -0.1413619 | -0.2378520 | -0.2779103 | NA | NA | NA | NA | NA | NA | NA |
| 15815_GO:0016491 oxidoreductase activity | 0.4051042 | 0.3999372 | 0.4575764 | 0.4941961 | 0.3925663 | 0.2491603 | 0.0851637 | 0.2070749 | 0.1875111 | 0.0416982 | -0.0012222 | -0.0252241 | -0.0603459 | -0.1248785 | -0.1817621 | -0.2968221 | -0.3396933 | NA | NA | NA | NA | NA | NA | NA |
| 92_GO:0009058 biosynthetic process | 0.3799115 | 0.3750238 | 0.4297255 | 0.4620741 | 0.3670611 | 0.2321179 | 0.0781882 | 0.1950518 | 0.1789604 | 0.0386421 | -0.0012760 | -0.0229222 | -0.0590879 | -0.1207076 | -0.1731552 | -0.2799848 | -0.3181583 | NA | NA | NA | NA | NA | NA | NA |
| 577_GO:0005975 carbohydrate metabolic process | 0.3668820 | 0.3630482 | 0.4022764 | 0.4754107 | 0.3774238 | 0.2566775 | 0.1107928 | 0.1704070 | 0.1074703 | 0.0470450 | 0.0014950 | -0.0375440 | -0.0046401 | -0.0410270 | -0.1105559 | -0.2363121 | -0.3158646 | NA | NA | NA | NA | NA | NA | NA |
| 14434_IPR014434 Monothiol glutaredoxin | 0.3623264 | 0.3581425 | 0.4029847 | 0.4564130 | 0.3624392 | 0.2388988 | 0.0935846 | 0.1763473 | 0.1354587 | 0.0420959 | 0.0002528 | -0.0301644 | -0.0281034 | -0.0744119 | -0.1346062 | -0.2486648 | -0.3080757 | NA | NA | NA | NA | NA | NA | NA |
| 7325_GO:0006807 nitrogen compound metabolic process | 0.3207744 | 0.3173008 | 0.3534642 | 0.4116606 | 0.3268430 | 0.2199589 | 0.0920288 | 0.1514541 | 0.1029285 | 0.0397983 | 0.0009330 | -0.0307122 | -0.0112476 | -0.0462329 | -0.1044341 | -0.2112872 | -0.2749864 | NA | NA | NA | NA | NA | NA | NA |
| 11537_IPR011537 NADH ubiquinone oxidoreductase, F subunit | 0.3041458 | 0.3010939 | 0.3316753 | 0.3982774 | 0.3161575 | 0.2174231 | 0.0968787 | 0.1387074 | 0.0797747 | 0.0403875 | 0.0016282 | -0.0333210 | 0.0036279 | -0.0232403 | -0.0835720 | -0.1910451 | -0.2630811 | NA | NA | NA | NA | NA | NA | NA |
| 28268_IPR028268 Pianissimo family | -0.2486329 | -0.2460016 | -0.2730953 | -0.3210893 | -0.2549177 | -0.1727308 | -0.0737624 | -0.1161556 | -0.0752783 | -0.0315178 | -0.0009111 | 0.0248664 | 0.0051071 | 0.0306316 | 0.0770440 | 0.1614223 | 0.2137362 | NA | NA | NA | NA | NA | NA | NA |
| 16484_IPR016484 GTP-binding protein EngA | -0.2516543 | -0.2499826 | -0.2621908 | -0.3576461 | -0.2836967 | -0.2112181 | -0.1141417 | -0.0974762 | -0.0030678 | -0.0427861 | -0.0039737 | 0.0424091 | -0.0534871 | -0.0535217 | 0.0145804 | 0.1252613 | 0.2259765 | NA | NA | NA | NA | NA | NA | NA |
| 5815_GO:0016740 transferase activity | 0.0052099 | 0.0015722 | 0.0571048 | -0.1112409 | -0.0874344 | -0.1278369 | -0.1410904 | 0.0750142 | 0.2657496 | -0.0386637 | -0.0110052 | 0.0617619 | -0.2120082 | -0.3059985 | -0.2306526 | -0.1411039 | 0.0303574 | NA | NA | NA | NA | NA | NA | NA |
| 2176_IPR002176 Crossover junction endodeoxyribonuclease RuvC | -0.0230526 | -0.0261403 | 0.0224636 | -0.1394791 | -0.1099149 | -0.1382666 | -0.1394873 | 0.0567283 | 0.2386945 | -0.0394928 | -0.0103368 | 0.0602273 | -0.1965898 | -0.2811346 | -0.2058569 | -0.1131110 | 0.0522139 | NA | NA | NA | NA | NA | NA | NA |
| 5990_GO:0009058 biosynthetic process | -0.0351494 | -0.0381614 | 0.0099265 | -0.1568227 | -0.1236714 | -0.1485889 | -0.1451577 | 0.0521362 | 0.2388872 | -0.0416001 | -0.0105420 | 0.0623460 | -0.1994337 | -0.2841005 | -0.2054509 | -0.1072672 | 0.0631212 | NA | NA | NA | NA | NA | NA | NA |
| 26856_GO:0016787 hydrolase activity | -0.1227731 | -0.1234412 | -0.1066339 | -0.2233385 | -0.1768278 | -0.1574872 | -0.1147571 | -0.0174966 | 0.1079079 | -0.0371595 | -0.0065042 | 0.0464838 | -0.1138515 | -0.1525721 | -0.0877409 | 0.0040744 | 0.1246729 | NA | NA | NA | NA | NA | NA | NA |
| 7318_NA | -0.0845598 | -0.0864456 | -0.0529983 | -0.2007658 | -0.1587069 | -0.1607776 | -0.1357958 | 0.0168300 | 0.1794395 | -0.0412412 | -0.0088665 | 0.0567990 | -0.1627337 | -0.2265898 | -0.1515691 | -0.0519951 | 0.0997301 | NA | NA | NA | NA | NA | NA | NA |
| 30970_IPR030970 Probable phospholipid ABC transporter-binding protein MlaD | -0.1622414 | -0.1631436 | -0.1406379 | -0.2957697 | -0.2341717 | -0.2088215 | -0.1524148 | -0.0227314 | 0.1440168 | -0.0493166 | -0.0086544 | 0.0617617 | -0.1515901 | -0.2032605 | -0.1171778 | 0.0046443 | 0.1649391 | NA | NA | NA | NA | NA | NA | NA |
# I want to save this as a csv:
write.csv(MINT_sPLS_mat.corr_and_LOGOCV_GOs, file = "/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Paper_drafts/the_analysis_is_finalised/Code/output_tables/MINT_sPLS_sim_and LOGOCV_stab_scores_GOs.csv", quote = F, row.names = F)
Bray-Curtis similarity within replicates - is Function always more stable regardless of the hierarchical level?
# Taxa - Genus level
GENUS <- left_join(otu_table(megan_genus_abundant_known_phyla_only) %>%
as.data.frame %>%
rownames_to_column("OTU"),
tax_table(megan_genus_abundant_known_phyla_only) %>%
as.data.frame() %>%
rownames_to_column("OTU")) %>%
column_to_rownames("OTU")
## Joining with `by = join_by(OTU)`
# Now summarising raw counts at phylum level
GENUS <- ddply(GENUS, "Genus", numcolwise(sum)) %>%
column_to_rownames("Genus")
# Ready to compute raw abundances per sample - by dividing cell value with column sum
GENUS_RA <- GENUS
for (i in 1:(ncol(GENUS))) { # '2:' as the first column is not numeric
GENUS_RA[i] <- GENUS_RA[i] / sum(GENUS_RA[i])
}
# Checking that rel abunds sum up to 1:
colSums(GENUS_RA)
## 11-049-1_S89_R1 11-049-2_S90_R1 11-049-3_S91_R1
## 1 1 1
## 11-049-4_S92_R1 11-162-1_S81_R1 11-162-2_S82_R1
## 1 1 1
## 11-162-3_S83_R1 11-162-4_S84_R1 13-124-1_S9_R1
## 1 1 1
## 13-124-2_S10_R1 13-124-3_S11_R1 13-124-4_S12_R1
## 1 1 1
## 21-550-1_S69_R1 21-550-2_S70_R1 21-550-3_S71_R1
## 1 1 1
## 21-550-4_S72_R1 21-580-1_S57_R1 21-580-2_S58_R1
## 1 1 1
## 21-580-3_S59_R1 21-580-4_S60_R1 22-084-1_S41_R1
## 1 1 1
## 22-084-2_S42_R1 22-084-3_S43_R1 22-084-4_S44_R1
## 1 1 1
## Agincourt1-1_S33_R1 Agincourt1-2_S34_R1 Agincourt1-3_S35_R1
## 1 1 1
## Agincourt1-4_S36_R1 Arlington-1_S37_R1 Arlington-2_S38_R1
## 1 1 1
## Arlington-3_S39_R1 Arlington-4_S40_R1 Boult-1_S25_R1
## 1 1 1
## Boult-2_S26_R1 Boult-3_S27_R1 Boult-4_S28_R1
## 1 1 1
## Broomfield-1_S49_R1 Broomfield-3_S51_R1 Broomfield-4_S52_R1
## 1 1 1
## Broomfield-rpt-2_S115_R1 Centipede-1_S57_R1 Centipede-2_S58_R1
## 1 1 1
## Centipede-3_S59_R1 Centipede-4_S60_R1 Chicken-1_S69_R1
## 1 1 1
## Chicken-2_S70_R1 Chicken-3_S71_R1 Chicken-4_S72_R1
## 1 1 1
## Chinaman-1_S65_R1 Chinaman-2_S66_R1 Chinaman-3_S67_R1
## 1 1 1
## Chinaman-4_S68_R1 Corbett-1_S17_R1 Corbett-2_S18_R1
## 1 1 1
## Corbett-3_S19_R1 Corbett-4_S20_R1 Davie-1_S1_R1
## 1 1 1
## Davie-2_S2_R1 Davie-3_S3_R1 Davie-4_S4_R1
## 1 1 1
## Erskine-1_S61_R1 Erskine-2_S62_R1 Erskine-3_S63_R1
## 1 1 1
## Erskine-4_S64_R1 Fairfax-1_S33_R1 Fairfax-2_S34_R1
## 1 1 1
## Fairfax-3_S35_R1 Fairfax-4_S36_R1 Farquaharson-1_S1_R1
## 1 1 1
## Farquaharson-2_S2_R1 Farquaharson-3_S3_R1 Farquaharson-4_S4_R1
## 1 1 1
## Feather-1_S5_R1 Feather-2_S6_R1 Feather-3_S7_R1
## 1 1 1
## Feather-4_S8_R1 Fore-and-Aft-1_S77_R1 Fore-and-Aft-2_S78_R1
## 1 1 1
## Fore-and-Aft-3_S79_R1 Fore-and-Aft-4_S80_R1 Fork-1_S49_R1
## 1 1 1
## Fork-2_S50_R1 Fork-3_S51_R1 Fork-4_S52_R1
## 1 1 1
## Grub-1_S65_R1 Grub-2_S66_R1 Grub-3_S67_R1
## 1 1 1
## Grub-4_S68_R1 Hastings-1_S41_R1 Hastings-2_S42_R1
## 1 1 1
## Hastings-3_S43_R1 Hastings-4_S44_R1 Hedley-1_S21_R1
## 1 1 1
## Hedley-2_S22_R1 Hedley-3_S23_R1 Helix-1_S61_R1
## 1 1 1
## Helix-2_S62_R1 Helix-3_S63_R1 Helix-4_S64_R1
## 1 1 1
## Hoskyn-1_S29_R1 Hoskyn-2_S30_R1 Hoskyn-3_S31_R1
## 1 1 1
## Hoskyn-4_S32_R1 JohnBrewer-1_S93_R1 JohnBrewer-2_S94_R1
## 1 1 1
## JohnBrewer-3_S97_R1 JohnBrewer-4_S98_R1 Kelso-1_S85_R1
## 1 1 1
## Kelso-2_S86_R1 Kelso-3_S87_R1 Kelso-4_S88_R1
## 1 1 1
## Knife-1_S45_R1 Knife-2_S46_R1 Knife-3_S47_R1
## 1 1 1
## Knife-4_S48_R1 Lagoon-1_S13_R1 Lagoon-2_S14_R1
## 1 1 1
## Lagoon-3_S15_R1 Lagoon-4_S16_R1 LittleKelso-1_S81_R1
## 1 1 1
## LittleKelso-2_S82_R1 LittleKelso-3_S83_R1 LittleKelso-4_S84_R1
## 1 1 1
## Lynchs-1_S99_R1 Lynchs-2_S100_R1 Lynchs-3_S101_R1
## 1 1 1
## Lynchs-4_S102_R1 Mantis-1_S85_R1 Mantis-2_S86_R1
## 1 1 1
## Mantis-3_S87_R1 Mantis-4_S88_R1 Masthead-1_S53_R1
## 1 1 1
## Masthead-2_S54_R1 Masthead-3_S55_R1 Masthead-4_S56_R1
## 1 1 1
## McCulloch-1_S17_R1 McCulloch-2_S18_R1 McCulloch-3_S19_R1
## 1 1 1
## McCulloch-4_S20_R1 McSweeney-1_S5_R1 McSweeney-2_S6_R1
## 1 1 1
## McSweeney-3_S7_R1 McSweeney-4_S8_R1 Monsoon-1_S21_R1
## 1 1 1
## Monsoon-2_S22_R1 Monsoon-3_S23_R1 Monsoon-4_S24_R1
## 1 1 1
## Moore-1_S25_R1 Moore-2_S26_R1 Moore-3_S27_R1
## 1 1 1
## Moore-4_S28_R1 Myrmidon-1_S53_R1 Myrmidon-2_S54_R1
## 1 1 1
## Myrmidon-3_S55_R1 Myrmidon-4_S56_R1 North-1_S37_R1
## 1 1 1
## North-2_S38_R1 North-3_S39_R1 North-4_S40_R1
## 1 1 1
## Peart-1_S13_R1 Peart-2_S14_R1 Peart-3_S15_R1
## 1 1 1
## Peart-4_S16_R1 Rib-1_S73_R1 Rib-2_S74_R1
## 1 1 1
## Rib-3_S75_R1 Rib-4_S76_R1 Roxburgh-1_S89_R1
## 1 1 1
## Roxburgh-2_S90_R1 Roxburgh-3_S91_R1 Roxburgh-4_S92_R1
## 1 1 1
## Sanbank1-1_S77_R1 Sanbank1-2_S78_R1 Sanbank1-3_S79_R1
## 1 1 1
## Sanbank1-4_S80_R1 SmallLagoon-1_S45_R1 SmallLagoon-2_S46_R1
## 1 1 1
## SmallLagoon-3_S47_R1 SmallLagoon-4_S48_R1 St-Crispin-1_S73_R1
## 1 1 1
## St-Crispin-2_S74_R1 St-Crispin-3_S75_R1 St-Crispin-4_S76_R1
## 1 1 1
## Taylor-1_S9_R1 Taylor-2_S10_R1 Taylor-3_S11_R1
## 1 1 1
## Taylor-4_S12_R1 Thetford-1_S29_R1 Thetford-2_S30_R1
## 1 1 1
## Thetford-3_S31_R1 Thetford-4_S32_R1
## 1 1
bray_curtis_genus <- vegdist(t(GENUS_RA), # needs transposing
method = "bray", # I am computing Bray Curtis dissimilarity
diag = F, # diagonals will be 0 as it's on the same samples
upper = TRUE) %>% # Only the upper bit of the matrix
as.matrix() %>% # Output as matrix
reshape2::melt() %>% # Getting it in long format to have pairwise comparisons - this is needed for visualisation
left_join(., data.frame(sample_data(megan_genus_clr)) %>%
rownames_to_column("Var1")) %>% # adding reef names for first pair in the BC sim matrix
dplyr::select(c("Var1", "Var2", "value", "REEF_NAME")) %>% # Selecting only columns of interest
dplyr::rename(., REEF_NAME_for_Var1 = REEF_NAME) %>% # Rename to know that reef names correspond to first samples in the BC sim matrix
left_join(., data.frame(sample_data(megan_genus_clr)) %>%
rownames_to_column("Var2")) %>% # Now merging based on Var2
dplyr::select(c("Var1", "Var2", "value", "REEF_NAME_for_Var1", "REEF_NAME")) %>% # Selecting only columns of interest
dplyr::rename(., REEF_NAME_for_Var2 = REEF_NAME) # Rename to know that reef names correspond to second samples in the BC sim matrix
## Joining with `by = join_by(Var1)`
## Joining with `by = join_by(Var2)`
bray_curtis_genus <- dplyr::filter(bray_curtis_genus, REEF_NAME_for_Var1==REEF_NAME_for_Var2) %>% # Only selecting pairs where the reef names are the same, that way I only pull out dissimilarity values from the same site - Within replicate level!
dplyr::filter(value != 0) %>% # Removing values that have a zero - those are comparisons between the same replicate! We don't need those as dissimilarity will be zero (the same replicates are completely identical)
mutate(Bray_Curtis_similarity = 1-value) %>% # computing Bray Curtis similarity as 1 - BC dissimilarity
left_join(., data.frame(sample_data(megan_genus_clr))[,c(1,2)] %>% # Only getting columns Reef name and Sampling trip
rownames_to_column("Var1"))
## Joining with `by = join_by(Var1)`
bray_curtis_genus_median <- round(median(bray_curtis_genus$Bray_Curtis_similarity), digits = 2)
bray_curtis_genus_mean <- round(mean(bray_curtis_genus$Bray_Curtis_similarity), digits = 2)
bray_curtis_genus_SD <- round(sd(bray_curtis_genus$Bray_Curtis_similarity), digits = 2)
bray_curtis_genus_minimum <- round(min(bray_curtis_genus$Bray_Curtis_similarity), digits = 2)
# Plotting as bocplots:
bray_curtis_genus_boxplots <- bray_curtis_genus %>%
ggplot(aes(x = "Within replicate for taxa (Genus level)",
y = Bray_Curtis_similarity,
# col = Sampling_trip
)) +
geom_boxplot(show.legend = FALSE, outlier.shape = NA) +
geom_jitter(aes(color=Sampling_trip), size=0.4, alpha=0.2) +
stat_summary(aes(label = paste("Median:", bray_curtis_genus_median,
"\nMean:", bray_curtis_genus_mean,
"\nSD:", bray_curtis_genus_SD,
"\nMin:", bray_curtis_genus_minimum)),
fun.y = median,
geom = "text",
color = "black"
) + # median and SD as text
stat_summary(fun=mean,
geom="point",
shape=20,
size=1.5,
color="seagreen1",
fill="seagreen1") + # Plotting the mean as a green dot!
scale_color_manual(values = c("indianred", # Sampling trip 1
"indianred4", # Sampling trip 2
"red3", # Sampling trip 3
"slateblue") # Sampling trip 4
) +
ylim(0, 1) + # Fixing the y axis to show min and max Bray Curtis dissimilarity
ylab("Bray-Curtis similarity") +
theme(legend.position = "none")
## Warning: The `fun.y` argument of `stat_summary()` is deprecated as of ggplot2 3.3.0.
## ℹ Please use the `fun` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Taxa - Family level
FAMILY <- left_join(otu_table(megan_genus_abundant_known_phyla_only) %>%
as.data.frame %>%
rownames_to_column("OTU"),
tax_table(megan_genus_abundant_known_phyla_only) %>%
as.data.frame() %>%
rownames_to_column("OTU")) %>%
column_to_rownames("OTU")
## Joining with `by = join_by(OTU)`
# Now summarising raw counts at phylum level
FAMILY <- ddply(FAMILY, "Family", numcolwise(sum)) %>%
column_to_rownames("Family")
# Ready to compute raw abundances per sample - by dividing cell value with column sum
FAMILY_RA <- FAMILY
for (i in 1:(ncol(FAMILY))) { # '2:' as the first column is not numeric
FAMILY_RA[i] <- FAMILY_RA[i] / sum(FAMILY_RA[i])
}
# Checking that rel abunds sum up to 1:
colSums(FAMILY_RA)
## 11-049-1_S89_R1 11-049-2_S90_R1 11-049-3_S91_R1
## 1 1 1
## 11-049-4_S92_R1 11-162-1_S81_R1 11-162-2_S82_R1
## 1 1 1
## 11-162-3_S83_R1 11-162-4_S84_R1 13-124-1_S9_R1
## 1 1 1
## 13-124-2_S10_R1 13-124-3_S11_R1 13-124-4_S12_R1
## 1 1 1
## 21-550-1_S69_R1 21-550-2_S70_R1 21-550-3_S71_R1
## 1 1 1
## 21-550-4_S72_R1 21-580-1_S57_R1 21-580-2_S58_R1
## 1 1 1
## 21-580-3_S59_R1 21-580-4_S60_R1 22-084-1_S41_R1
## 1 1 1
## 22-084-2_S42_R1 22-084-3_S43_R1 22-084-4_S44_R1
## 1 1 1
## Agincourt1-1_S33_R1 Agincourt1-2_S34_R1 Agincourt1-3_S35_R1
## 1 1 1
## Agincourt1-4_S36_R1 Arlington-1_S37_R1 Arlington-2_S38_R1
## 1 1 1
## Arlington-3_S39_R1 Arlington-4_S40_R1 Boult-1_S25_R1
## 1 1 1
## Boult-2_S26_R1 Boult-3_S27_R1 Boult-4_S28_R1
## 1 1 1
## Broomfield-1_S49_R1 Broomfield-3_S51_R1 Broomfield-4_S52_R1
## 1 1 1
## Broomfield-rpt-2_S115_R1 Centipede-1_S57_R1 Centipede-2_S58_R1
## 1 1 1
## Centipede-3_S59_R1 Centipede-4_S60_R1 Chicken-1_S69_R1
## 1 1 1
## Chicken-2_S70_R1 Chicken-3_S71_R1 Chicken-4_S72_R1
## 1 1 1
## Chinaman-1_S65_R1 Chinaman-2_S66_R1 Chinaman-3_S67_R1
## 1 1 1
## Chinaman-4_S68_R1 Corbett-1_S17_R1 Corbett-2_S18_R1
## 1 1 1
## Corbett-3_S19_R1 Corbett-4_S20_R1 Davie-1_S1_R1
## 1 1 1
## Davie-2_S2_R1 Davie-3_S3_R1 Davie-4_S4_R1
## 1 1 1
## Erskine-1_S61_R1 Erskine-2_S62_R1 Erskine-3_S63_R1
## 1 1 1
## Erskine-4_S64_R1 Fairfax-1_S33_R1 Fairfax-2_S34_R1
## 1 1 1
## Fairfax-3_S35_R1 Fairfax-4_S36_R1 Farquaharson-1_S1_R1
## 1 1 1
## Farquaharson-2_S2_R1 Farquaharson-3_S3_R1 Farquaharson-4_S4_R1
## 1 1 1
## Feather-1_S5_R1 Feather-2_S6_R1 Feather-3_S7_R1
## 1 1 1
## Feather-4_S8_R1 Fore-and-Aft-1_S77_R1 Fore-and-Aft-2_S78_R1
## 1 1 1
## Fore-and-Aft-3_S79_R1 Fore-and-Aft-4_S80_R1 Fork-1_S49_R1
## 1 1 1
## Fork-2_S50_R1 Fork-3_S51_R1 Fork-4_S52_R1
## 1 1 1
## Grub-1_S65_R1 Grub-2_S66_R1 Grub-3_S67_R1
## 1 1 1
## Grub-4_S68_R1 Hastings-1_S41_R1 Hastings-2_S42_R1
## 1 1 1
## Hastings-3_S43_R1 Hastings-4_S44_R1 Hedley-1_S21_R1
## 1 1 1
## Hedley-2_S22_R1 Hedley-3_S23_R1 Helix-1_S61_R1
## 1 1 1
## Helix-2_S62_R1 Helix-3_S63_R1 Helix-4_S64_R1
## 1 1 1
## Hoskyn-1_S29_R1 Hoskyn-2_S30_R1 Hoskyn-3_S31_R1
## 1 1 1
## Hoskyn-4_S32_R1 JohnBrewer-1_S93_R1 JohnBrewer-2_S94_R1
## 1 1 1
## JohnBrewer-3_S97_R1 JohnBrewer-4_S98_R1 Kelso-1_S85_R1
## 1 1 1
## Kelso-2_S86_R1 Kelso-3_S87_R1 Kelso-4_S88_R1
## 1 1 1
## Knife-1_S45_R1 Knife-2_S46_R1 Knife-3_S47_R1
## 1 1 1
## Knife-4_S48_R1 Lagoon-1_S13_R1 Lagoon-2_S14_R1
## 1 1 1
## Lagoon-3_S15_R1 Lagoon-4_S16_R1 LittleKelso-1_S81_R1
## 1 1 1
## LittleKelso-2_S82_R1 LittleKelso-3_S83_R1 LittleKelso-4_S84_R1
## 1 1 1
## Lynchs-1_S99_R1 Lynchs-2_S100_R1 Lynchs-3_S101_R1
## 1 1 1
## Lynchs-4_S102_R1 Mantis-1_S85_R1 Mantis-2_S86_R1
## 1 1 1
## Mantis-3_S87_R1 Mantis-4_S88_R1 Masthead-1_S53_R1
## 1 1 1
## Masthead-2_S54_R1 Masthead-3_S55_R1 Masthead-4_S56_R1
## 1 1 1
## McCulloch-1_S17_R1 McCulloch-2_S18_R1 McCulloch-3_S19_R1
## 1 1 1
## McCulloch-4_S20_R1 McSweeney-1_S5_R1 McSweeney-2_S6_R1
## 1 1 1
## McSweeney-3_S7_R1 McSweeney-4_S8_R1 Monsoon-1_S21_R1
## 1 1 1
## Monsoon-2_S22_R1 Monsoon-3_S23_R1 Monsoon-4_S24_R1
## 1 1 1
## Moore-1_S25_R1 Moore-2_S26_R1 Moore-3_S27_R1
## 1 1 1
## Moore-4_S28_R1 Myrmidon-1_S53_R1 Myrmidon-2_S54_R1
## 1 1 1
## Myrmidon-3_S55_R1 Myrmidon-4_S56_R1 North-1_S37_R1
## 1 1 1
## North-2_S38_R1 North-3_S39_R1 North-4_S40_R1
## 1 1 1
## Peart-1_S13_R1 Peart-2_S14_R1 Peart-3_S15_R1
## 1 1 1
## Peart-4_S16_R1 Rib-1_S73_R1 Rib-2_S74_R1
## 1 1 1
## Rib-3_S75_R1 Rib-4_S76_R1 Roxburgh-1_S89_R1
## 1 1 1
## Roxburgh-2_S90_R1 Roxburgh-3_S91_R1 Roxburgh-4_S92_R1
## 1 1 1
## Sanbank1-1_S77_R1 Sanbank1-2_S78_R1 Sanbank1-3_S79_R1
## 1 1 1
## Sanbank1-4_S80_R1 SmallLagoon-1_S45_R1 SmallLagoon-2_S46_R1
## 1 1 1
## SmallLagoon-3_S47_R1 SmallLagoon-4_S48_R1 St-Crispin-1_S73_R1
## 1 1 1
## St-Crispin-2_S74_R1 St-Crispin-3_S75_R1 St-Crispin-4_S76_R1
## 1 1 1
## Taylor-1_S9_R1 Taylor-2_S10_R1 Taylor-3_S11_R1
## 1 1 1
## Taylor-4_S12_R1 Thetford-1_S29_R1 Thetford-2_S30_R1
## 1 1 1
## Thetford-3_S31_R1 Thetford-4_S32_R1
## 1 1
bray_curtis_family <- vegdist(t(FAMILY_RA), # needs transposing
method = "bray", # I am computing Bray Curtis dissimilarity
diag = F, # diagonals will be 0 as it's on the same samples
upper = TRUE) %>% # Only the upper bit of the matrix
as.matrix() %>% # Output as matrix
reshape2::melt() %>% # Getting it in long format to have pairwise comparisons - this is needed for visualisation
left_join(., data.frame(sample_data(megan_genus_clr)) %>%
rownames_to_column("Var1")) %>% # adding reef names for first pair in the BC sim matrix
dplyr::select(c("Var1", "Var2", "value", "REEF_NAME")) %>% # Selecting only columns of interest
dplyr::rename(., REEF_NAME_for_Var1 = REEF_NAME) %>% # Rename to know that reef names correspond to first samples in the BC sim matrix
left_join(., data.frame(sample_data(megan_genus_clr)) %>%
rownames_to_column("Var2")) %>% # Now merging based on Var2
dplyr::select(c("Var1", "Var2", "value", "REEF_NAME_for_Var1", "REEF_NAME")) %>% # Selecting only columns of interest
dplyr::rename(., REEF_NAME_for_Var2 = REEF_NAME) # Rename to know that reef names correspond to second samples in the BC sim matrix
## Joining with `by = join_by(Var1)`
## Joining with `by = join_by(Var2)`
bray_curtis_family <- dplyr::filter(bray_curtis_family, REEF_NAME_for_Var1==REEF_NAME_for_Var2) %>% # Only selecting pairs where the reef names are the same, that way I only pull out dissimilarity values from the same site - Within replicate level!
dplyr::filter(value != 0) %>% # Removing values that have a zero - those are comparisons between the same replicate! We don't need those as dissimilarity will be zero (the same replicates are completely identical)
mutate(Bray_Curtis_similarity = 1-value) %>% # computing Bray Curtis similarity as 1 - BC dissimilarity
left_join(., data.frame(sample_data(megan_genus_clr))[,c(1,2)] %>% # Only getting columns Reef name and Sampling trip
rownames_to_column("Var1"))
## Joining with `by = join_by(Var1)`
bray_curtis_family_median <- round(median(bray_curtis_family$Bray_Curtis_similarity), digits = 2)
bray_curtis_family_mean <- round(mean(bray_curtis_family$Bray_Curtis_similarity), digits = 2)
bray_curtis_family_SD <- round(sd(bray_curtis_family$Bray_Curtis_similarity), digits = 2)
bray_curtis_family_minimum <- round(min(bray_curtis_family$Bray_Curtis_similarity), digits = 2)
# Plotting as bocplots:
bray_curtis_family_boxplots <- bray_curtis_family %>%
ggplot(aes(x = "Within replicate for taxa (family level)",
y = Bray_Curtis_similarity,
# col = Sampling_trip
)) +
geom_boxplot(show.legend = FALSE, outlier.shape = NA) +
geom_jitter(aes(color=Sampling_trip), size=0.4, alpha=0.2) +
stat_summary(aes(label = paste("Median:", bray_curtis_family_median,
"\nMean:", bray_curtis_family_mean,
"\nSD:", bray_curtis_family_SD,
"\nMin:", bray_curtis_family_minimum)),
fun.y = median,
geom = "text",
color = "black"
) + # median and SD as text
stat_summary(fun=mean,
geom="point",
shape=20,
size=1.5,
color="seagreen1",
fill="seagreen1") + # Plotting the mean as a green dot!
scale_color_manual(values = c("indianred", # Sampling trip 1
"indianred4", # Sampling trip 2
"red3", # Sampling trip 3
"slateblue") # Sampling trip 4
) +
ylim(0, 1) + # Fixing the y axis to show min and max Bray Curtis dissimilarity
ylab("Bray-Curtis similarity") +
theme(legend.position = "none")
# Taxa - Order level
ORDER <- left_join(otu_table(megan_genus_abundant_known_phyla_only) %>%
as.data.frame %>%
rownames_to_column("OTU"),
tax_table(megan_genus_abundant_known_phyla_only) %>%
as.data.frame() %>%
rownames_to_column("OTU")) %>%
column_to_rownames("OTU")
## Joining with `by = join_by(OTU)`
# Now summarising raw counts at phylum level
ORDER <- ddply(ORDER, "Order", numcolwise(sum)) %>%
column_to_rownames("Order")
# Ready to compute raw abundances per sample - by dividing cell value with column sum
ORDER_RA <- ORDER
for (i in 1:(ncol(ORDER))) { # '2:' as the first column is not numeric
ORDER_RA[i] <- ORDER_RA[i] / sum(ORDER_RA[i])
}
# Checking that rel abunds sum up to 1:
colSums(ORDER_RA)
## 11-049-1_S89_R1 11-049-2_S90_R1 11-049-3_S91_R1
## 1 1 1
## 11-049-4_S92_R1 11-162-1_S81_R1 11-162-2_S82_R1
## 1 1 1
## 11-162-3_S83_R1 11-162-4_S84_R1 13-124-1_S9_R1
## 1 1 1
## 13-124-2_S10_R1 13-124-3_S11_R1 13-124-4_S12_R1
## 1 1 1
## 21-550-1_S69_R1 21-550-2_S70_R1 21-550-3_S71_R1
## 1 1 1
## 21-550-4_S72_R1 21-580-1_S57_R1 21-580-2_S58_R1
## 1 1 1
## 21-580-3_S59_R1 21-580-4_S60_R1 22-084-1_S41_R1
## 1 1 1
## 22-084-2_S42_R1 22-084-3_S43_R1 22-084-4_S44_R1
## 1 1 1
## Agincourt1-1_S33_R1 Agincourt1-2_S34_R1 Agincourt1-3_S35_R1
## 1 1 1
## Agincourt1-4_S36_R1 Arlington-1_S37_R1 Arlington-2_S38_R1
## 1 1 1
## Arlington-3_S39_R1 Arlington-4_S40_R1 Boult-1_S25_R1
## 1 1 1
## Boult-2_S26_R1 Boult-3_S27_R1 Boult-4_S28_R1
## 1 1 1
## Broomfield-1_S49_R1 Broomfield-3_S51_R1 Broomfield-4_S52_R1
## 1 1 1
## Broomfield-rpt-2_S115_R1 Centipede-1_S57_R1 Centipede-2_S58_R1
## 1 1 1
## Centipede-3_S59_R1 Centipede-4_S60_R1 Chicken-1_S69_R1
## 1 1 1
## Chicken-2_S70_R1 Chicken-3_S71_R1 Chicken-4_S72_R1
## 1 1 1
## Chinaman-1_S65_R1 Chinaman-2_S66_R1 Chinaman-3_S67_R1
## 1 1 1
## Chinaman-4_S68_R1 Corbett-1_S17_R1 Corbett-2_S18_R1
## 1 1 1
## Corbett-3_S19_R1 Corbett-4_S20_R1 Davie-1_S1_R1
## 1 1 1
## Davie-2_S2_R1 Davie-3_S3_R1 Davie-4_S4_R1
## 1 1 1
## Erskine-1_S61_R1 Erskine-2_S62_R1 Erskine-3_S63_R1
## 1 1 1
## Erskine-4_S64_R1 Fairfax-1_S33_R1 Fairfax-2_S34_R1
## 1 1 1
## Fairfax-3_S35_R1 Fairfax-4_S36_R1 Farquaharson-1_S1_R1
## 1 1 1
## Farquaharson-2_S2_R1 Farquaharson-3_S3_R1 Farquaharson-4_S4_R1
## 1 1 1
## Feather-1_S5_R1 Feather-2_S6_R1 Feather-3_S7_R1
## 1 1 1
## Feather-4_S8_R1 Fore-and-Aft-1_S77_R1 Fore-and-Aft-2_S78_R1
## 1 1 1
## Fore-and-Aft-3_S79_R1 Fore-and-Aft-4_S80_R1 Fork-1_S49_R1
## 1 1 1
## Fork-2_S50_R1 Fork-3_S51_R1 Fork-4_S52_R1
## 1 1 1
## Grub-1_S65_R1 Grub-2_S66_R1 Grub-3_S67_R1
## 1 1 1
## Grub-4_S68_R1 Hastings-1_S41_R1 Hastings-2_S42_R1
## 1 1 1
## Hastings-3_S43_R1 Hastings-4_S44_R1 Hedley-1_S21_R1
## 1 1 1
## Hedley-2_S22_R1 Hedley-3_S23_R1 Helix-1_S61_R1
## 1 1 1
## Helix-2_S62_R1 Helix-3_S63_R1 Helix-4_S64_R1
## 1 1 1
## Hoskyn-1_S29_R1 Hoskyn-2_S30_R1 Hoskyn-3_S31_R1
## 1 1 1
## Hoskyn-4_S32_R1 JohnBrewer-1_S93_R1 JohnBrewer-2_S94_R1
## 1 1 1
## JohnBrewer-3_S97_R1 JohnBrewer-4_S98_R1 Kelso-1_S85_R1
## 1 1 1
## Kelso-2_S86_R1 Kelso-3_S87_R1 Kelso-4_S88_R1
## 1 1 1
## Knife-1_S45_R1 Knife-2_S46_R1 Knife-3_S47_R1
## 1 1 1
## Knife-4_S48_R1 Lagoon-1_S13_R1 Lagoon-2_S14_R1
## 1 1 1
## Lagoon-3_S15_R1 Lagoon-4_S16_R1 LittleKelso-1_S81_R1
## 1 1 1
## LittleKelso-2_S82_R1 LittleKelso-3_S83_R1 LittleKelso-4_S84_R1
## 1 1 1
## Lynchs-1_S99_R1 Lynchs-2_S100_R1 Lynchs-3_S101_R1
## 1 1 1
## Lynchs-4_S102_R1 Mantis-1_S85_R1 Mantis-2_S86_R1
## 1 1 1
## Mantis-3_S87_R1 Mantis-4_S88_R1 Masthead-1_S53_R1
## 1 1 1
## Masthead-2_S54_R1 Masthead-3_S55_R1 Masthead-4_S56_R1
## 1 1 1
## McCulloch-1_S17_R1 McCulloch-2_S18_R1 McCulloch-3_S19_R1
## 1 1 1
## McCulloch-4_S20_R1 McSweeney-1_S5_R1 McSweeney-2_S6_R1
## 1 1 1
## McSweeney-3_S7_R1 McSweeney-4_S8_R1 Monsoon-1_S21_R1
## 1 1 1
## Monsoon-2_S22_R1 Monsoon-3_S23_R1 Monsoon-4_S24_R1
## 1 1 1
## Moore-1_S25_R1 Moore-2_S26_R1 Moore-3_S27_R1
## 1 1 1
## Moore-4_S28_R1 Myrmidon-1_S53_R1 Myrmidon-2_S54_R1
## 1 1 1
## Myrmidon-3_S55_R1 Myrmidon-4_S56_R1 North-1_S37_R1
## 1 1 1
## North-2_S38_R1 North-3_S39_R1 North-4_S40_R1
## 1 1 1
## Peart-1_S13_R1 Peart-2_S14_R1 Peart-3_S15_R1
## 1 1 1
## Peart-4_S16_R1 Rib-1_S73_R1 Rib-2_S74_R1
## 1 1 1
## Rib-3_S75_R1 Rib-4_S76_R1 Roxburgh-1_S89_R1
## 1 1 1
## Roxburgh-2_S90_R1 Roxburgh-3_S91_R1 Roxburgh-4_S92_R1
## 1 1 1
## Sanbank1-1_S77_R1 Sanbank1-2_S78_R1 Sanbank1-3_S79_R1
## 1 1 1
## Sanbank1-4_S80_R1 SmallLagoon-1_S45_R1 SmallLagoon-2_S46_R1
## 1 1 1
## SmallLagoon-3_S47_R1 SmallLagoon-4_S48_R1 St-Crispin-1_S73_R1
## 1 1 1
## St-Crispin-2_S74_R1 St-Crispin-3_S75_R1 St-Crispin-4_S76_R1
## 1 1 1
## Taylor-1_S9_R1 Taylor-2_S10_R1 Taylor-3_S11_R1
## 1 1 1
## Taylor-4_S12_R1 Thetford-1_S29_R1 Thetford-2_S30_R1
## 1 1 1
## Thetford-3_S31_R1 Thetford-4_S32_R1
## 1 1
bray_curtis_order <- vegdist(t(ORDER_RA), # needs transposing
method = "bray", # I am computing Bray Curtis dissimilarity
diag = F, # diagonals will be 0 as it's on the same samples
upper = TRUE) %>% # Only the upper bit of the matrix
as.matrix() %>% # Output as matrix
reshape2::melt() %>% # Getting it in long format to have pairwise comparisons - this is needed for visualisation
left_join(., data.frame(sample_data(megan_genus_clr)) %>%
rownames_to_column("Var1")) %>% # adding reef names for first pair in the BC sim matrix
dplyr::select(c("Var1", "Var2", "value", "REEF_NAME")) %>% # Selecting only columns of interest
dplyr::rename(., REEF_NAME_for_Var1 = REEF_NAME) %>% # Rename to know that reef names correspond to first samples in the BC sim matrix
left_join(., data.frame(sample_data(megan_genus_clr)) %>%
rownames_to_column("Var2")) %>% # Now merging based on Var2
dplyr::select(c("Var1", "Var2", "value", "REEF_NAME_for_Var1", "REEF_NAME")) %>% # Selecting only columns of interest
dplyr::rename(., REEF_NAME_for_Var2 = REEF_NAME) # Rename to know that reef names correspond to second samples in the BC sim matrix
## Joining with `by = join_by(Var1)`
## Joining with `by = join_by(Var2)`
bray_curtis_order <- dplyr::filter(bray_curtis_order, REEF_NAME_for_Var1==REEF_NAME_for_Var2) %>% # Only selecting pairs where the reef names are the same, that way I only pull out dissimilarity values from the same site - Within replicate level!
dplyr::filter(value != 0) %>% # Removing values that have a zero - those are comparisons between the same replicate! We don't need those as dissimilarity will be zero (the same replicates are completely identical)
mutate(Bray_Curtis_similarity = 1-value) %>% # computing Bray Curtis similarity as 1 - BC dissimilarity
left_join(., data.frame(sample_data(megan_genus_clr))[,c(1,2)] %>% # Only getting columns Reef name and Sampling trip
rownames_to_column("Var1"))
## Joining with `by = join_by(Var1)`
bray_curtis_order_median <- round(median(bray_curtis_order$Bray_Curtis_similarity), digits = 2)
bray_curtis_order_mean <- round(mean(bray_curtis_order$Bray_Curtis_similarity), digits = 2)
bray_curtis_order_SD <- round(sd(bray_curtis_order$Bray_Curtis_similarity), digits = 2)
bray_curtis_order_minimum <- round(min(bray_curtis_order$Bray_Curtis_similarity), digits = 2)
# Plotting as bocplots:
bray_curtis_order_boxplots <- bray_curtis_order %>%
ggplot(aes(x = "Within replicate for taxa (Order level)",
y = Bray_Curtis_similarity,
# col = Sampling_trip
)) +
geom_boxplot(show.legend = FALSE, outlier.shape = NA) +
geom_jitter(aes(color=Sampling_trip), size=0.4, alpha=0.2) +
stat_summary(aes(label = paste("Median:", bray_curtis_order_median,
"\nMean:", bray_curtis_order_mean,
"\nSD:", bray_curtis_order_SD,
"\nMin:", bray_curtis_order_minimum)),
fun.y = median,
geom = "text",
color = "black"
) + # median and SD as text
stat_summary(fun=mean,
geom="point",
shape=20,
size=1.5,
color="seagreen1",
fill="seagreen1") + # Plotting the mean as a green dot!
scale_color_manual(values = c("indianred", # Sampling trip 1
"indianred4", # Sampling trip 2
"red3", # Sampling trip 3
"slateblue") # Sampling trip 4
) +
ylim(0, 1) + # Fixing the y axis to show min and max Bray Curtis dissimilarity
ylab("Bray-Curtis similarity") +
theme(legend.position = "none")
# Taxa - Class level
CLASS <- left_join(otu_table(megan_genus_abundant_known_phyla_only) %>%
as.data.frame %>%
rownames_to_column("OTU"),
tax_table(megan_genus_abundant_known_phyla_only) %>%
as.data.frame() %>%
rownames_to_column("OTU")) %>%
column_to_rownames("OTU")
## Joining with `by = join_by(OTU)`
# Now summarising raw counts at class level
CLASS <- ddply(CLASS, "Class", numcolwise(sum)) %>%
column_to_rownames("Class")
# Ready to compute raw abundances per sample - by dividing cell value with column sum
CLASS_RA <- CLASS
for (i in 1:(ncol(CLASS))) { # '2:' as the first column is not numeric
CLASS_RA[i] <- CLASS_RA[i] / sum(CLASS_RA[i])
}
# Checking that rel abunds sum up to 1:
colSums(CLASS_RA)
## 11-049-1_S89_R1 11-049-2_S90_R1 11-049-3_S91_R1
## 1 1 1
## 11-049-4_S92_R1 11-162-1_S81_R1 11-162-2_S82_R1
## 1 1 1
## 11-162-3_S83_R1 11-162-4_S84_R1 13-124-1_S9_R1
## 1 1 1
## 13-124-2_S10_R1 13-124-3_S11_R1 13-124-4_S12_R1
## 1 1 1
## 21-550-1_S69_R1 21-550-2_S70_R1 21-550-3_S71_R1
## 1 1 1
## 21-550-4_S72_R1 21-580-1_S57_R1 21-580-2_S58_R1
## 1 1 1
## 21-580-3_S59_R1 21-580-4_S60_R1 22-084-1_S41_R1
## 1 1 1
## 22-084-2_S42_R1 22-084-3_S43_R1 22-084-4_S44_R1
## 1 1 1
## Agincourt1-1_S33_R1 Agincourt1-2_S34_R1 Agincourt1-3_S35_R1
## 1 1 1
## Agincourt1-4_S36_R1 Arlington-1_S37_R1 Arlington-2_S38_R1
## 1 1 1
## Arlington-3_S39_R1 Arlington-4_S40_R1 Boult-1_S25_R1
## 1 1 1
## Boult-2_S26_R1 Boult-3_S27_R1 Boult-4_S28_R1
## 1 1 1
## Broomfield-1_S49_R1 Broomfield-3_S51_R1 Broomfield-4_S52_R1
## 1 1 1
## Broomfield-rpt-2_S115_R1 Centipede-1_S57_R1 Centipede-2_S58_R1
## 1 1 1
## Centipede-3_S59_R1 Centipede-4_S60_R1 Chicken-1_S69_R1
## 1 1 1
## Chicken-2_S70_R1 Chicken-3_S71_R1 Chicken-4_S72_R1
## 1 1 1
## Chinaman-1_S65_R1 Chinaman-2_S66_R1 Chinaman-3_S67_R1
## 1 1 1
## Chinaman-4_S68_R1 Corbett-1_S17_R1 Corbett-2_S18_R1
## 1 1 1
## Corbett-3_S19_R1 Corbett-4_S20_R1 Davie-1_S1_R1
## 1 1 1
## Davie-2_S2_R1 Davie-3_S3_R1 Davie-4_S4_R1
## 1 1 1
## Erskine-1_S61_R1 Erskine-2_S62_R1 Erskine-3_S63_R1
## 1 1 1
## Erskine-4_S64_R1 Fairfax-1_S33_R1 Fairfax-2_S34_R1
## 1 1 1
## Fairfax-3_S35_R1 Fairfax-4_S36_R1 Farquaharson-1_S1_R1
## 1 1 1
## Farquaharson-2_S2_R1 Farquaharson-3_S3_R1 Farquaharson-4_S4_R1
## 1 1 1
## Feather-1_S5_R1 Feather-2_S6_R1 Feather-3_S7_R1
## 1 1 1
## Feather-4_S8_R1 Fore-and-Aft-1_S77_R1 Fore-and-Aft-2_S78_R1
## 1 1 1
## Fore-and-Aft-3_S79_R1 Fore-and-Aft-4_S80_R1 Fork-1_S49_R1
## 1 1 1
## Fork-2_S50_R1 Fork-3_S51_R1 Fork-4_S52_R1
## 1 1 1
## Grub-1_S65_R1 Grub-2_S66_R1 Grub-3_S67_R1
## 1 1 1
## Grub-4_S68_R1 Hastings-1_S41_R1 Hastings-2_S42_R1
## 1 1 1
## Hastings-3_S43_R1 Hastings-4_S44_R1 Hedley-1_S21_R1
## 1 1 1
## Hedley-2_S22_R1 Hedley-3_S23_R1 Helix-1_S61_R1
## 1 1 1
## Helix-2_S62_R1 Helix-3_S63_R1 Helix-4_S64_R1
## 1 1 1
## Hoskyn-1_S29_R1 Hoskyn-2_S30_R1 Hoskyn-3_S31_R1
## 1 1 1
## Hoskyn-4_S32_R1 JohnBrewer-1_S93_R1 JohnBrewer-2_S94_R1
## 1 1 1
## JohnBrewer-3_S97_R1 JohnBrewer-4_S98_R1 Kelso-1_S85_R1
## 1 1 1
## Kelso-2_S86_R1 Kelso-3_S87_R1 Kelso-4_S88_R1
## 1 1 1
## Knife-1_S45_R1 Knife-2_S46_R1 Knife-3_S47_R1
## 1 1 1
## Knife-4_S48_R1 Lagoon-1_S13_R1 Lagoon-2_S14_R1
## 1 1 1
## Lagoon-3_S15_R1 Lagoon-4_S16_R1 LittleKelso-1_S81_R1
## 1 1 1
## LittleKelso-2_S82_R1 LittleKelso-3_S83_R1 LittleKelso-4_S84_R1
## 1 1 1
## Lynchs-1_S99_R1 Lynchs-2_S100_R1 Lynchs-3_S101_R1
## 1 1 1
## Lynchs-4_S102_R1 Mantis-1_S85_R1 Mantis-2_S86_R1
## 1 1 1
## Mantis-3_S87_R1 Mantis-4_S88_R1 Masthead-1_S53_R1
## 1 1 1
## Masthead-2_S54_R1 Masthead-3_S55_R1 Masthead-4_S56_R1
## 1 1 1
## McCulloch-1_S17_R1 McCulloch-2_S18_R1 McCulloch-3_S19_R1
## 1 1 1
## McCulloch-4_S20_R1 McSweeney-1_S5_R1 McSweeney-2_S6_R1
## 1 1 1
## McSweeney-3_S7_R1 McSweeney-4_S8_R1 Monsoon-1_S21_R1
## 1 1 1
## Monsoon-2_S22_R1 Monsoon-3_S23_R1 Monsoon-4_S24_R1
## 1 1 1
## Moore-1_S25_R1 Moore-2_S26_R1 Moore-3_S27_R1
## 1 1 1
## Moore-4_S28_R1 Myrmidon-1_S53_R1 Myrmidon-2_S54_R1
## 1 1 1
## Myrmidon-3_S55_R1 Myrmidon-4_S56_R1 North-1_S37_R1
## 1 1 1
## North-2_S38_R1 North-3_S39_R1 North-4_S40_R1
## 1 1 1
## Peart-1_S13_R1 Peart-2_S14_R1 Peart-3_S15_R1
## 1 1 1
## Peart-4_S16_R1 Rib-1_S73_R1 Rib-2_S74_R1
## 1 1 1
## Rib-3_S75_R1 Rib-4_S76_R1 Roxburgh-1_S89_R1
## 1 1 1
## Roxburgh-2_S90_R1 Roxburgh-3_S91_R1 Roxburgh-4_S92_R1
## 1 1 1
## Sanbank1-1_S77_R1 Sanbank1-2_S78_R1 Sanbank1-3_S79_R1
## 1 1 1
## Sanbank1-4_S80_R1 SmallLagoon-1_S45_R1 SmallLagoon-2_S46_R1
## 1 1 1
## SmallLagoon-3_S47_R1 SmallLagoon-4_S48_R1 St-Crispin-1_S73_R1
## 1 1 1
## St-Crispin-2_S74_R1 St-Crispin-3_S75_R1 St-Crispin-4_S76_R1
## 1 1 1
## Taylor-1_S9_R1 Taylor-2_S10_R1 Taylor-3_S11_R1
## 1 1 1
## Taylor-4_S12_R1 Thetford-1_S29_R1 Thetford-2_S30_R1
## 1 1 1
## Thetford-3_S31_R1 Thetford-4_S32_R1
## 1 1
bray_curtis_class <- vegdist(t(CLASS_RA), # needs transposing
method = "bray", # I am computing Bray Curtis dissimilarity
diag = F, # diagonals will be 0 as it's on the same samples
upper = TRUE) %>% # Only the upper bit of the matrix
as.matrix() %>% # Output as matrix
reshape2::melt() %>% # Getting it in long format to have pairwise comparisons - this is needed for visualisation
left_join(., data.frame(sample_data(megan_genus_clr)) %>%
rownames_to_column("Var1")) %>% # adding reef names for first pair in the BC sim matrix
dplyr::select(c("Var1", "Var2", "value", "REEF_NAME")) %>% # Selecting only columns of interest
dplyr::rename(., REEF_NAME_for_Var1 = REEF_NAME) %>% # Rename to know that reef names correspond to first samples in the BC sim matrix
left_join(., data.frame(sample_data(megan_genus_clr)) %>%
rownames_to_column("Var2")) %>% # Now merging based on Var2
dplyr::select(c("Var1", "Var2", "value", "REEF_NAME_for_Var1", "REEF_NAME")) %>% # Selecting only columns of interest
dplyr::rename(., REEF_NAME_for_Var2 = REEF_NAME) # Rename to know that reef names correspond to second samples in the BC sim matrix
## Joining with `by = join_by(Var1)`
## Joining with `by = join_by(Var2)`
bray_curtis_class <- dplyr::filter(bray_curtis_class, REEF_NAME_for_Var1==REEF_NAME_for_Var2) %>% # Only selecting pairs where the reef names are the same, that way I only pull out dissimilarity values from the same site - Within replicate level!
dplyr::filter(value != 0) %>% # Removing values that have a zero - those are comparisons between the same replicate! We don't need those as dissimilarity will be zero (the same replicates are completely identical)
mutate(Bray_Curtis_similarity = 1-value) %>% # computing Bray Curtis similarity as 1 - BC dissimilarity
left_join(., data.frame(sample_data(megan_genus_clr))[,c(1,2)] %>% # Only getting columns Reef name and Sampling trip
rownames_to_column("Var1"))
## Joining with `by = join_by(Var1)`
bray_curtis_class_median <- round(median(bray_curtis_class$Bray_Curtis_similarity), digits = 2)
bray_curtis_class_SD <- round(sd(bray_curtis_class$Bray_Curtis_similarity), digits = 2)
bray_curtis_class_minimum <- round(min(bray_curtis_class$Bray_Curtis_similarity), digits = 2)
# Plotting as bocplots:
bray_curtis_class_boxplots <- bray_curtis_class %>%
ggplot(aes(x = "Within replicate for taxa (Class level)",
y = Bray_Curtis_similarity,
# col = Sampling_trip
)) +
geom_boxplot(show.legend = FALSE, outlier.shape = NA) +
geom_jitter(aes(color=Sampling_trip), size=0.4, alpha=0.2) +
stat_summary(aes(label = paste("Median:", bray_curtis_class_median,
"\nSD:", bray_curtis_class_SD,
"\nMin:", bray_curtis_class_minimum)),
fun.y = median,
geom = "text",
color = "black"
) + # median and SD as text
stat_summary(fun=mean,
geom="point",
shape=20,
size=1.5,
color="seagreen1",
fill="seagreen1") + # Plotting the mean as a green dot!
scale_color_manual(values = c("indianred", # Sampling trip 1
"indianred4", # Sampling trip 2
"red3", # Sampling trip 3
"slateblue") # Sampling trip 4
) +
ylim(0, 1) + # Fixing the y axis to show min and max Bray Curtis dissimilarity
ylab("Bray-Curtis similarity") +
theme(legend.position = "none")
# Taxa - PHYLUM level
PHYLUM <- left_join(otu_table(megan_genus_abundant_known_phyla_only) %>%
as.data.frame %>%
rownames_to_column("OTU"),
tax_table(megan_genus_abundant_known_phyla_only) %>%
as.data.frame() %>%
rownames_to_column("OTU")) %>%
column_to_rownames("OTU")
## Joining with `by = join_by(OTU)`
# Now summarising raw counts at PHYLUM level
PHYLUM <- ddply(PHYLUM, "Phylum" , numcolwise(sum)) %>%
column_to_rownames("Phylum" )
# Ready to compute raw abundances per sample - by dividing cell value with column sum
PHYLUM_RA <- PHYLUM
for (i in 1:(ncol(PHYLUM))) { # '2:' as the first column is not numeric
PHYLUM_RA[i] <- PHYLUM_RA[i] / sum(PHYLUM_RA[i])
}
# Checking that rel abunds sum up to 1:
colSums(PHYLUM_RA)
## 11-049-1_S89_R1 11-049-2_S90_R1 11-049-3_S91_R1
## 1 1 1
## 11-049-4_S92_R1 11-162-1_S81_R1 11-162-2_S82_R1
## 1 1 1
## 11-162-3_S83_R1 11-162-4_S84_R1 13-124-1_S9_R1
## 1 1 1
## 13-124-2_S10_R1 13-124-3_S11_R1 13-124-4_S12_R1
## 1 1 1
## 21-550-1_S69_R1 21-550-2_S70_R1 21-550-3_S71_R1
## 1 1 1
## 21-550-4_S72_R1 21-580-1_S57_R1 21-580-2_S58_R1
## 1 1 1
## 21-580-3_S59_R1 21-580-4_S60_R1 22-084-1_S41_R1
## 1 1 1
## 22-084-2_S42_R1 22-084-3_S43_R1 22-084-4_S44_R1
## 1 1 1
## Agincourt1-1_S33_R1 Agincourt1-2_S34_R1 Agincourt1-3_S35_R1
## 1 1 1
## Agincourt1-4_S36_R1 Arlington-1_S37_R1 Arlington-2_S38_R1
## 1 1 1
## Arlington-3_S39_R1 Arlington-4_S40_R1 Boult-1_S25_R1
## 1 1 1
## Boult-2_S26_R1 Boult-3_S27_R1 Boult-4_S28_R1
## 1 1 1
## Broomfield-1_S49_R1 Broomfield-3_S51_R1 Broomfield-4_S52_R1
## 1 1 1
## Broomfield-rpt-2_S115_R1 Centipede-1_S57_R1 Centipede-2_S58_R1
## 1 1 1
## Centipede-3_S59_R1 Centipede-4_S60_R1 Chicken-1_S69_R1
## 1 1 1
## Chicken-2_S70_R1 Chicken-3_S71_R1 Chicken-4_S72_R1
## 1 1 1
## Chinaman-1_S65_R1 Chinaman-2_S66_R1 Chinaman-3_S67_R1
## 1 1 1
## Chinaman-4_S68_R1 Corbett-1_S17_R1 Corbett-2_S18_R1
## 1 1 1
## Corbett-3_S19_R1 Corbett-4_S20_R1 Davie-1_S1_R1
## 1 1 1
## Davie-2_S2_R1 Davie-3_S3_R1 Davie-4_S4_R1
## 1 1 1
## Erskine-1_S61_R1 Erskine-2_S62_R1 Erskine-3_S63_R1
## 1 1 1
## Erskine-4_S64_R1 Fairfax-1_S33_R1 Fairfax-2_S34_R1
## 1 1 1
## Fairfax-3_S35_R1 Fairfax-4_S36_R1 Farquaharson-1_S1_R1
## 1 1 1
## Farquaharson-2_S2_R1 Farquaharson-3_S3_R1 Farquaharson-4_S4_R1
## 1 1 1
## Feather-1_S5_R1 Feather-2_S6_R1 Feather-3_S7_R1
## 1 1 1
## Feather-4_S8_R1 Fore-and-Aft-1_S77_R1 Fore-and-Aft-2_S78_R1
## 1 1 1
## Fore-and-Aft-3_S79_R1 Fore-and-Aft-4_S80_R1 Fork-1_S49_R1
## 1 1 1
## Fork-2_S50_R1 Fork-3_S51_R1 Fork-4_S52_R1
## 1 1 1
## Grub-1_S65_R1 Grub-2_S66_R1 Grub-3_S67_R1
## 1 1 1
## Grub-4_S68_R1 Hastings-1_S41_R1 Hastings-2_S42_R1
## 1 1 1
## Hastings-3_S43_R1 Hastings-4_S44_R1 Hedley-1_S21_R1
## 1 1 1
## Hedley-2_S22_R1 Hedley-3_S23_R1 Helix-1_S61_R1
## 1 1 1
## Helix-2_S62_R1 Helix-3_S63_R1 Helix-4_S64_R1
## 1 1 1
## Hoskyn-1_S29_R1 Hoskyn-2_S30_R1 Hoskyn-3_S31_R1
## 1 1 1
## Hoskyn-4_S32_R1 JohnBrewer-1_S93_R1 JohnBrewer-2_S94_R1
## 1 1 1
## JohnBrewer-3_S97_R1 JohnBrewer-4_S98_R1 Kelso-1_S85_R1
## 1 1 1
## Kelso-2_S86_R1 Kelso-3_S87_R1 Kelso-4_S88_R1
## 1 1 1
## Knife-1_S45_R1 Knife-2_S46_R1 Knife-3_S47_R1
## 1 1 1
## Knife-4_S48_R1 Lagoon-1_S13_R1 Lagoon-2_S14_R1
## 1 1 1
## Lagoon-3_S15_R1 Lagoon-4_S16_R1 LittleKelso-1_S81_R1
## 1 1 1
## LittleKelso-2_S82_R1 LittleKelso-3_S83_R1 LittleKelso-4_S84_R1
## 1 1 1
## Lynchs-1_S99_R1 Lynchs-2_S100_R1 Lynchs-3_S101_R1
## 1 1 1
## Lynchs-4_S102_R1 Mantis-1_S85_R1 Mantis-2_S86_R1
## 1 1 1
## Mantis-3_S87_R1 Mantis-4_S88_R1 Masthead-1_S53_R1
## 1 1 1
## Masthead-2_S54_R1 Masthead-3_S55_R1 Masthead-4_S56_R1
## 1 1 1
## McCulloch-1_S17_R1 McCulloch-2_S18_R1 McCulloch-3_S19_R1
## 1 1 1
## McCulloch-4_S20_R1 McSweeney-1_S5_R1 McSweeney-2_S6_R1
## 1 1 1
## McSweeney-3_S7_R1 McSweeney-4_S8_R1 Monsoon-1_S21_R1
## 1 1 1
## Monsoon-2_S22_R1 Monsoon-3_S23_R1 Monsoon-4_S24_R1
## 1 1 1
## Moore-1_S25_R1 Moore-2_S26_R1 Moore-3_S27_R1
## 1 1 1
## Moore-4_S28_R1 Myrmidon-1_S53_R1 Myrmidon-2_S54_R1
## 1 1 1
## Myrmidon-3_S55_R1 Myrmidon-4_S56_R1 North-1_S37_R1
## 1 1 1
## North-2_S38_R1 North-3_S39_R1 North-4_S40_R1
## 1 1 1
## Peart-1_S13_R1 Peart-2_S14_R1 Peart-3_S15_R1
## 1 1 1
## Peart-4_S16_R1 Rib-1_S73_R1 Rib-2_S74_R1
## 1 1 1
## Rib-3_S75_R1 Rib-4_S76_R1 Roxburgh-1_S89_R1
## 1 1 1
## Roxburgh-2_S90_R1 Roxburgh-3_S91_R1 Roxburgh-4_S92_R1
## 1 1 1
## Sanbank1-1_S77_R1 Sanbank1-2_S78_R1 Sanbank1-3_S79_R1
## 1 1 1
## Sanbank1-4_S80_R1 SmallLagoon-1_S45_R1 SmallLagoon-2_S46_R1
## 1 1 1
## SmallLagoon-3_S47_R1 SmallLagoon-4_S48_R1 St-Crispin-1_S73_R1
## 1 1 1
## St-Crispin-2_S74_R1 St-Crispin-3_S75_R1 St-Crispin-4_S76_R1
## 1 1 1
## Taylor-1_S9_R1 Taylor-2_S10_R1 Taylor-3_S11_R1
## 1 1 1
## Taylor-4_S12_R1 Thetford-1_S29_R1 Thetford-2_S30_R1
## 1 1 1
## Thetford-3_S31_R1 Thetford-4_S32_R1
## 1 1
bray_curtis_phylum <- vegdist(t(PHYLUM_RA), # needs transposing
method = "bray", # I am computing Bray Curtis dissimilarity
diag = F, # diagonals will be 0 as it's on the same samples
upper = TRUE) %>% # Only the upper bit of the matrix
as.matrix() %>% # Output as matrix
reshape2::melt() %>% # Getting it in long format to have pairwise comparisons - this is needed for visualisation
left_join(., data.frame(sample_data(megan_genus_clr)) %>%
rownames_to_column("Var1")) %>% # adding reef names for first pair in the BC sim matrix
dplyr::select(c("Var1", "Var2", "value", "REEF_NAME")) %>% # Selecting only columns of interest
dplyr::rename(., REEF_NAME_for_Var1 = REEF_NAME) %>% # Rename to know that reef names correspond to first samples in the BC sim matrix
left_join(., data.frame(sample_data(megan_genus_clr)) %>%
rownames_to_column("Var2")) %>% # Now merging based on Var2
dplyr::select(c("Var1", "Var2", "value", "REEF_NAME_for_Var1", "REEF_NAME")) %>% # Selecting only columns of interest
dplyr::rename(., REEF_NAME_for_Var2 = REEF_NAME) # Rename to know that reef names correspond to second samples in the BC sim matrix
## Joining with `by = join_by(Var1)`
## Joining with `by = join_by(Var2)`
bray_curtis_phylum <- dplyr::filter(bray_curtis_phylum, REEF_NAME_for_Var1==REEF_NAME_for_Var2) %>% # Only selecting pairs where the reef names are the same, that way I only pull out dissimilarity values from the same site - Within replicate level!
dplyr::filter(value != 0) %>% # Removing values that have a zero - those are comparisons between the same replicate! We don't need those as dissimilarity will be zero (the same replicates are completely identical)
mutate(Bray_Curtis_similarity = 1-value) %>% # computing Bray Curtis similarity as 1 - BC dissimilarity
left_join(., data.frame(sample_data(megan_genus_clr))[,c(1,2)] %>% # Only getting columns Reef name and Sampling trip
rownames_to_column("Var1"))
## Joining with `by = join_by(Var1)`
bray_curtis_phylum_median <- round(median(bray_curtis_phylum$Bray_Curtis_similarity), digits = 2)
bray_curtis_phylum_SD <- round(sd(bray_curtis_phylum$Bray_Curtis_similarity), digits = 2)
bray_curtis_phylum_minimum <- round(min(bray_curtis_phylum$Bray_Curtis_similarity), digits = 2)
# Plotting as bocplots:
bray_curtis_phylum_boxplots <- bray_curtis_phylum %>%
ggplot(aes(x = "Within replicate for taxa (Phylum level)",
y = Bray_Curtis_similarity,
# col = Sampling_trip
)) +
geom_boxplot(show.legend = FALSE, outlier.shape = NA) +
geom_jitter(aes(color=Sampling_trip), size=0.4, alpha=0.2) +
stat_summary(aes(label = paste("Median:", bray_curtis_phylum_median,
"\nSD:", bray_curtis_phylum_SD,
"\nMin:", bray_curtis_phylum_minimum)),
fun.y = median,
geom = "text",
color = "black"
) + # median and SD as text
stat_summary(fun=mean,
geom="point",
shape=20,
size=1.5,
color="seagreen1",
fill="seagreen1") + # Plotting the mean as a green dot!
scale_color_manual(values = c("indianred", # Sampling trip 1
"indianred4", # Sampling trip 2
"red3", # Sampling trip 3
"slateblue") # Sampling trip 4
) +
ylim(0, 1) + # Fixing the y axis to show min and max Bray Curtis dissimilarity
ylab("Bray-Curtis similarity") +
theme(legend.position = "none")
# GO terms - Rank 3
bray_curtis_GO3 <- vegdist(megan_GO_3_RA_no_rare@otu_table %>% # Getting my table with relative abundances
t() %>% # needs to be transposed though
as.data.frame(), # df
method = "bray", # I am computing Bray Curtis dissimilarity
diag = F, # diagonals will be 0 as it's on the same samples
upper = TRUE) %>% # Only the upper bit of the matrix
as.matrix() %>% # Output as matrix
reshape2::melt() %>% # Getting it in long format to have pairwise comparisons - this is needed for visualisation
left_join(., data.frame(sample_data(megan_go_clr_3)) %>%
rownames_to_column("Var1")) %>% # adding reef names for first pair in the BC sim matrix
dplyr::select(c("Var1", "Var2", "value", "REEF_NAME")) %>% # Selecting only columns of interest
dplyr::rename(., REEF_NAME_for_Var1 = REEF_NAME) %>% # Rename to know that reef names correspond to first samples in the BC sim matrix
left_join(., data.frame(sample_data(megan_go_clr_3)) %>%
rownames_to_column("Var2")) %>% # Now merging based on Var2
dplyr::select(c("Var1", "Var2", "value", "REEF_NAME_for_Var1", "REEF_NAME")) %>% # Selecting only columns of interest
dplyr::rename(., REEF_NAME_for_Var2 = REEF_NAME) # Rename to know that reef names correspond to second samples in the BC sim matrix
## Joining with `by = join_by(Var1)`
## Joining with `by = join_by(Var2)`
bray_curtis_GO3 <- dplyr::filter(bray_curtis_GO3, REEF_NAME_for_Var1==REEF_NAME_for_Var2) %>% # Only selecting pairs where the reef names are the same, that way I only pull out dissimilarity values from the same site - Within replicate level!
dplyr::filter(value != 0) %>% # Removing values that have a zero - those are comparisons between the same replicate! We don't need those as dissimilarity will be zero (the same replicates are completely identical)
mutate(Bray_Curtis_similarity = 1-value) %>% # computing Bray Curtis similarity as 1 - BC dissimilarity
left_join(., data.frame(sample_data(megan_go_clr_3))[,c(1,2)] %>% # Only getting columns Reef name and Sampling trip
rownames_to_column("Var1"))
## Joining with `by = join_by(Var1)`
bray_curtis_GO3_median <- round(median(bray_curtis_GO3$Bray_Curtis_similarity), digits = 2)
bray_curtis_GO3_SD <- round(sd(bray_curtis_GO3$Bray_Curtis_similarity), digits = 2)
bray_curtis_GO3_minimum <- round(min(bray_curtis_GO3$Bray_Curtis_similarity), digits = 2)
# Plotting as bocplots:
bray_curtis_GO3_boxplots <- bray_curtis_GO3 %>%
ggplot(aes(x = "Within replicate for GO terms (Rank 3)",
y = Bray_Curtis_similarity,
# col = Sampling_trip
)) +
geom_boxplot(show.legend = FALSE, outlier.shape = NA) +
geom_jitter(aes(color=Sampling_trip), size=0.4, alpha=0.2) +
stat_summary(aes(label = paste("Median:", bray_curtis_GO3_median,
"\nSD:", bray_curtis_GO3_SD,
"\nMin:", bray_curtis_GO3_minimum)),
fun.y = median,
geom = "text",
color = "black"
) + # median and SD as text
stat_summary(fun=mean,
geom="point",
shape=20,
size=1.5,
color="seagreen1",
fill="seagreen1") + # Plotting the mean as a green dot!
scale_color_manual(values = c("indianred", # Sampling trip 1
"indianred4", # Sampling trip 2
"red3", # Sampling trip 3
"slateblue") # Sampling trip 4
) +
ylim(0, 1) + # Fixing the y axis to show min and max Bray Curtis dissimilarity
ylab("Bray-Curtis similarity") +
theme(legend.position = "none")
# GO terms - Rank 4
bray_curtis_GO4 <- vegdist(megan_GO_4_RA_no_rare@otu_table %>% # Getting my table with relative abundances
t() %>% # needs to be transposed though
as.data.frame(), # df
method = "bray", # I am computing Bray Curtis dissimilarity
diag = F, # diagonals will be 0 as it's on the same samples
upper = TRUE) %>% # Only the upper bit of the matrix
as.matrix() %>% # Output as matrix
reshape2::melt() %>% # Getting it in long format to have pairwise comparisons - this is needed for visualisation
left_join(., data.frame(sample_data(megan_go_clr_4)) %>%
rownames_to_column("Var1")) %>% # adding reef names for first pair in the BC sim matrix
dplyr::select(c("Var1", "Var2", "value", "REEF_NAME")) %>% # Selecting only columns of interest
dplyr::rename(., REEF_NAME_for_Var1 = REEF_NAME) %>% # Rename to know that reef names correspond to first samples in the BC sim matrix
left_join(., data.frame(sample_data(megan_go_clr_4)) %>%
rownames_to_column("Var2")) %>% # Now merging based on Var2
dplyr::select(c("Var1", "Var2", "value", "REEF_NAME_for_Var1", "REEF_NAME")) %>% # Selecting only columns of interest
dplyr::rename(., REEF_NAME_for_Var2 = REEF_NAME) # Rename to know that reef names correspond to second samples in the BC sim matrix
## Joining with `by = join_by(Var1)`
## Joining with `by = join_by(Var2)`
bray_curtis_GO4 <- dplyr::filter(bray_curtis_GO4, REEF_NAME_for_Var1==REEF_NAME_for_Var2) %>% # Only selecting pairs where the reef names are the same, that way I only pull out dissimilarity values from the same site - Within replicate level!
dplyr::filter(value != 0) %>% # Removing values that have a zero - those are comparisons between the same replicate! We don't need those as dissimilarity will be zero (the same replicates are completely identical)
mutate(Bray_Curtis_similarity = 1-value) %>% # computing Bray Curtis similarity as 1 - BC dissimilarity
left_join(., data.frame(sample_data(megan_go_clr_4))[,c(1,2)] %>% # Only getting columns Reef name and Sampling trip
rownames_to_column("Var1"))
## Joining with `by = join_by(Var1)`
bray_curtis_GO4_median <- round(median(bray_curtis_GO4$Bray_Curtis_similarity), digits = 2)
bray_curtis_GO4_mean <- round(mean(bray_curtis_GO4$Bray_Curtis_similarity), digits = 2)
bray_curtis_GO4_SD <- round(sd(bray_curtis_GO4$Bray_Curtis_similarity), digits = 2)
bray_curtis_GO4_minimum <- round(min(bray_curtis_GO4$Bray_Curtis_similarity), digits = 2)
# Plotting as bocplots:
bray_curtis_GO4_boxplots <- bray_curtis_GO4 %>%
ggplot(aes(x = "Within replicate for GO terms (Rank 4)",
y = Bray_Curtis_similarity,
# col = Sampling_trip
)) +
geom_boxplot(show.legend = FALSE, outlier.shape = NA) +
geom_jitter(aes(color=Sampling_trip), size=0.4, alpha=0.2) +
stat_summary(aes(label = paste("Median:", bray_curtis_GO4_median,
"\nMean:", bray_curtis_GO4_mean,
"\nSD:", bray_curtis_GO4_SD,
"\nMin:", bray_curtis_GO4_minimum)),
fun.y = median,
geom = "text",
color = "black"
) + # median and SD as text
stat_summary(fun=mean,
geom="point",
shape=20,
size=1.5,
color="seagreen1",
fill="seagreen1") + # Plotting the mean as a green dot!
scale_color_manual(values = c("indianred", # Sampling trip 1
"indianred4", # Sampling trip 2
"red3", # Sampling trip 3
"slateblue") # Sampling trip 4
) +
ylim(0, 1) + # Fixing the y axis to show min and max Bray Curtis dissimilarity
ylab("Bray-Curtis similarity") +
theme(legend.position = "none")
# GO terms - Rank 4
bray_curtis_GO5 <- vegdist(megan_GO_5_RA_no_rare@otu_table %>% # Getting my table with relative abundances
t() %>% # needs to be transposed though
as.data.frame(), # df
method = "bray", # I am computing Bray Curtis dissimilarity
diag = F, # diagonals will be 0 as it's on the same samples
upper = TRUE) %>% # Only the upper bit of the matrix
as.matrix() %>% # Output as matrix
reshape2::melt() %>% # Getting it in long format to have pairwise comparisons - this is needed for visualisation
left_join(., data.frame(sample_data(megan_go_clr_5)) %>%
rownames_to_column("Var1")) %>% # adding reef names for first pair in the BC sim matrix
dplyr::select(c("Var1", "Var2", "value", "REEF_NAME")) %>% # Selecting only columns of interest
dplyr::rename(., REEF_NAME_for_Var1 = REEF_NAME) %>% # Rename to know that reef names correspond to first samples in the BC sim matrix
left_join(., data.frame(sample_data(megan_go_clr_5)) %>%
rownames_to_column("Var2")) %>% # Now merging based on Var2
dplyr::select(c("Var1", "Var2", "value", "REEF_NAME_for_Var1", "REEF_NAME")) %>% # Selecting only columns of interest
dplyr::rename(., REEF_NAME_for_Var2 = REEF_NAME) # Rename to know that reef names correspond to second samples in the BC sim matrix
## Joining with `by = join_by(Var1)`
## Joining with `by = join_by(Var2)`
bray_curtis_GO5 <- dplyr::filter(bray_curtis_GO5, REEF_NAME_for_Var1==REEF_NAME_for_Var2) %>% # Only selecting pairs where the reef names are the same, that way I only pull out dissimilarity values from the same site - Within replicate level!
dplyr::filter(value != 0) %>% # Removing values that have a zero - those are comparisons between the same replicate! We don't need those as dissimilarity will be zero (the same replicates are completely identical)
mutate(Bray_Curtis_similarity = 1-value) %>% # computing Bray Curtis similarity as 1 - BC dissimilarity
left_join(., data.frame(sample_data(megan_go_clr_5))[,c(1,2)] %>% # Only getting columns Reef name and Sampling trip
rownames_to_column("Var1"))
## Joining with `by = join_by(Var1)`
bray_curtis_GO5_median <- round(median(bray_curtis_GO5$Bray_Curtis_similarity), digits = 2)
bray_curtis_GO5_mean <- round(mean(bray_curtis_GO5$Bray_Curtis_similarity), digits = 2)
bray_curtis_GO5_SD <- round(sd(bray_curtis_GO5$Bray_Curtis_similarity), digits = 2)
bray_curtis_GO5_minimum <- round(min(bray_curtis_GO5$Bray_Curtis_similarity), digits = 2)
# Plotting as bocplots:
bray_curtis_GO5_boxplots <- bray_curtis_GO5 %>%
ggplot(aes(x = "Within replicate for GO terms (Rank 5)",
y = Bray_Curtis_similarity,
# col = Sampling_trip
)) +
geom_boxplot(show.legend = FALSE, outlier.shape = NA) +
geom_jitter(aes(color=Sampling_trip), size=0.4, alpha=0.2) +
stat_summary(aes(label = paste("Median:", bray_curtis_GO5_median,
"\nMean:", bray_curtis_GO5_mean,
"\nSD:", bray_curtis_GO5_SD,
"\nMin:", bray_curtis_GO5_minimum)),
fun.y = median,
geom = "text",
color = "black"
) + # median and SD as text
stat_summary(fun=mean,
geom="point",
shape=20,
size=1.5,
color="seagreen1",
fill="seagreen1") + # Plotting the mean as a green dot!
scale_color_manual(values = c("indianred", # Sampling trip 1
"indianred4", # Sampling trip 2
"red3", # Sampling trip 3
"slateblue") # Sampling trip 4
) +
ylim(0, 1) + # Fixing the y axis to show min and max Bray Curtis dissimilarity
ylab("Bray-Curtis similarity") +
theme(legend.position = "none")
plot_grid(bray_curtis_genus_boxplots, # taxonomy first:
bray_curtis_family_boxplots,
bray_curtis_order_boxplots,
bray_curtis_class_boxplots,
bray_curtis_phylum_boxplots,
bray_curtis_GO5_boxplots, # Microbial function
bray_curtis_GO4_boxplots,
bray_curtis_GO3_boxplots,
ncol = 6, nrow = 1)
The global phyloseq object was also split into four subsets (separate phyloseq objects within each of the four transects), to explore how microbial taxa/genes correlate to WQ metrics within each of the trips, focussing primarily on stability.
We can examine how frequently each variable is selected when we subsample the data using the perf() function to measure how stable the signature is.
stab.trip1.final.spls2.WQ.taxa <- perf(trip1.final.spls2.WQ.taxa,
validation = 'Mfold',
folds = 4,
nrepeat = 50)
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
stab.trip1.final.spls2.WQ.GOs <- perf(trip1.final.spls2.WQ.GOs,
validation = 'Mfold',
folds = 4,
nrepeat = 50)
# Extract stability - Dimension 1
stab.Taxa.trip1.comp1 <- stab.trip1.final.spls2.WQ.taxa$features$stability.X$comp1
stab.GOs.trip1.comp1 <- stab.trip1.final.spls2.WQ.GOs$features$stability.X$comp1
# We extract the stability measures of only the variables selected in spls2
extr.stab.Taxa.trip1.comp1 <- stab.Taxa.trip1.comp1[selectVar(trip1.final.spls2.WQ.taxa,
comp =1)$X$name]
extr.stab.GOs.trip1.comp1 <- stab.GOs.trip1.comp1[selectVar(trip1.final.spls2.WQ.GOs,
comp =1)$X$name]
# Plotting stability scores as boxplots, while simultaneously doing the Wilcoxon rank sum test - are these differences statistically significant?
# Preparing the object first:
Wilcoxon_trip_1_stability <- bind_rows(as.data.frame(extr.stab.Taxa.trip1.comp1) %>%
dplyr::rename(., Stability_scores_Trip_1 = extr.stab.Taxa.trip1.comp1) %>%
mutate(Comparison = "1_Taxa"),
as.data.frame(extr.stab.GOs.trip1.comp1) %>%
dplyr::rename(., Stability_scores_Trip_1 = extr.stab.GOs.trip1.comp1) %>%
mutate(Comparison = "2_Functions"))
# Exporting this as a numerical value as well:
write.csv(Wilcoxon_trip_1_stability, "/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Paper_drafts/the_analysis_is_finalised/Code/output_tables/trip_1_sPLS_stability_scores.csv", row.names = F)
# Stability scores - getting median and SD: https://stackoverflow.com/questions/13372734/how-to-display-the-median-value-in-a-faceted-boxplot-in-ggplot
trip_1_stability_numerical_summ <- ddply(Wilcoxon_trip_1_stability,
.(Comparison),
summarize,
med = median(Stability_scores_Trip_1),
SD = sd(Stability_scores_Trip_1))
# Now plotting:
ggplot(Wilcoxon_trip_1_stability,
aes(x = Comparison, y = Stability_scores_Trip_1)) +
geom_boxplot(fill = "indianred", outlier.shape = NA) +
geom_jitter(size=0.8, alpha=0.5) +
geom_text(data = trip_1_stability_numerical_summ, aes(y = med, label = round(med,2)),size = 4.5, vjust = -0.5) + # adding median as text
geom_text(data = trip_1_stability_numerical_summ, aes(y = SD, label = round(SD,2)),size = 4.5, vjust = -0.5) + # adding median as text
ylim(0,1) +
labs(#x = "Microbial function",
y = "Re-occurrence of indicator features at PC 1 (4-fold CV, 50 reps)"
) +
theme_bw() +
stat_pvalue_manual(Wilcoxon_trip_1_stability %>%
pairwise_wilcox_test(Stability_scores_Trip_1 ~ Comparison) %>%
add_xy_position())
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_text()`).
## Removed 1 row containing missing values or values outside the scale range
## (`geom_text()`).
# Then I edited this manually in Inkscape
# Optimal parameters
keepX <- c(10, 10) # Keeping the same value across all trips!
trip2.final.spls2.WQ.taxa <- spls(X = OTUs_Trip2,
Y = metadata_Trip2[,24:40], # Choosing only medians!
ncomp = 2,
keepX = keepX,
mode = "regression")
trip2.final.spls2.WQ.GOs <- spls(X = GOs_Trip2,
Y = metadata_Trip2[,24:40], # Choosing only medians!
ncomp = 2,
keepX = keepX,
mode = "regression")
We can examine how frequently each variable is selected when we subsample the data using the perf() function to measure how stable the signature is.
stab.trip2.final.spls2.WQ.taxa <- perf(trip2.final.spls2.WQ.taxa,
validation = 'Mfold',
folds = 4,
nrepeat = 20)
stab.trip2.final.spls2.WQ.GOs <- perf(trip2.final.spls2.WQ.GOs,
validation = 'Mfold',
folds = 4,
nrepeat = 50)
# Extract stability - Dimension 1
stab.Taxa.trip2.comp1 <- stab.trip2.final.spls2.WQ.taxa$features$stability.X$comp1
stab.GOs.trip2.comp1 <- stab.trip2.final.spls2.WQ.GOs$features$stability.X$comp1
# We extract the stability measures of only the variables selected in spls2
extr.stab.Taxa.trip2.comp1 <- stab.Taxa.trip2.comp1[selectVar(trip2.final.spls2.WQ.taxa,
comp =1)$X$name]
extr.stab.GOs.trip2.comp1 <- stab.GOs.trip2.comp1[selectVar(trip2.final.spls2.WQ.GOs,
comp =1)$X$name]
# Plotting stability scores as boxplots, while simultaneously doing the Wilcoxon rank sum test - are these differences statistically significant?
# Preparing the object first:
Wilcoxon_trip_2_stability <- bind_rows(as.data.frame(extr.stab.Taxa.trip2.comp1) %>%
dplyr::rename(., Stability_scores_Trip_2 = extr.stab.Taxa.trip2.comp1) %>%
mutate(Comparison = "1_Taxa"),
as.data.frame(extr.stab.GOs.trip2.comp1) %>%
dplyr::rename(., Stability_scores_Trip_2 = extr.stab.GOs.trip2.comp1) %>%
mutate(Comparison = "2_Functions"))
# Exporting this as a numerical value as well:
write.csv(Wilcoxon_trip_2_stability, "/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Paper_drafts/the_analysis_is_finalised/Code/output_tables/trip_2_sPLS_stability_scores.csv", row.names = F)
# Stability scores - getting median and SD: https://stackoverflow.com/questions/13372734/how-to-display-the-median-value-in-a-faceted-boxplot-in-ggplot
trip_2_stability_numerical_summ <- ddply(Wilcoxon_trip_2_stability,
.(Comparison),
summarize,
med = median(Stability_scores_Trip_2),
SD = sd(Stability_scores_Trip_2))
# Now plotting:
ggplot(Wilcoxon_trip_2_stability,
aes(x = Comparison, y = Stability_scores_Trip_2)) +
geom_boxplot(fill = "indianred4", outlier.shape = NA) +
geom_jitter(size=0.8, alpha=0.5) +
geom_text(data = trip_2_stability_numerical_summ, aes(y = med, label = round(med,2)),size = 4.5, vjust = -0.5) + # adding median as text
geom_text(data = trip_2_stability_numerical_summ, aes(y = SD, label = round(SD,2)),size = 4.5, vjust = -0.5) + # adding median as text
ylim(0,1) +
labs(#x = "Microbial function",
y = "Re-occurrence of indicator features at PC 1 (4-fold CV, 50 reps)"
) +
theme_bw() +
stat_pvalue_manual(Wilcoxon_trip_2_stability %>%
pairwise_wilcox_test(Stability_scores_Trip_2 ~ Comparison) %>%
add_xy_position())
# Then I edited this manually in Inkscape
# Optimal parameters
keepX <- c(10, 10) # Keeping the same value across all trips!
trip3.final.spls2.WQ.taxa <- spls(X = OTUs_Trip3,
Y = metadata_Trip3.imputed, # Choosing only medians!
ncomp = 2,
keepX = keepX,
mode = "regression")
trip3.final.spls2.WQ.GOs <- spls(X = GOs_Trip3,
Y = metadata_Trip3.imputed, # Choosing only medians!
ncomp = 2,
keepX = keepX,
mode = "regression")
We can examine how frequently each variable is selected when we subsample the data using the perf() function to measure how stable the signature is.
stab.trip3.final.spls2.WQ.taxa <- perf(trip3.final.spls2.WQ.taxa,
validation = 'Mfold',
folds = 4,
nrepeat = 20)
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
stab.trip3.final.spls2.WQ.GOs <- perf(trip3.final.spls2.WQ.GOs,
validation = 'Mfold',
folds = 4,
nrepeat = 50)
# Extract stability - Dimension 1
stab.Taxa.trip3.comp1 <- stab.trip3.final.spls2.WQ.taxa$features$stability.X$comp1
stab.GOs.trip3.comp1 <- stab.trip3.final.spls2.WQ.GOs$features$stability.X$comp1
# We extract the stability measures of only the variables selected in spls2
extr.stab.Taxa.trip3.comp1 <- stab.Taxa.trip3.comp1[selectVar(trip3.final.spls2.WQ.taxa,
comp =1)$X$name]
extr.stab.GOs.trip3.comp1 <- stab.GOs.trip3.comp1[selectVar(trip3.final.spls2.WQ.GOs,
comp =1)$X$name]
# Plotting stability scores as boxplots, while simultaneously doing the Wilcoxon rank sum test - are these differences statistically significant?
# Preparing the object first:
Wilcoxon_trip_3_stability <- bind_rows(as.data.frame(extr.stab.Taxa.trip3.comp1) %>%
dplyr::rename(., Stability_scores_Trip_3 = extr.stab.Taxa.trip3.comp1) %>%
mutate(Comparison = "1_Taxa"),
as.data.frame(extr.stab.GOs.trip3.comp1) %>%
dplyr::rename(., Stability_scores_Trip_3 = extr.stab.GOs.trip3.comp1) %>%
mutate(Comparison = "2_Functions"))
# Exporting this as a numerical value as well:
write.csv(Wilcoxon_trip_3_stability, "/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Paper_drafts/the_analysis_is_finalised/Code/output_tables/trip_3_sPLS_stability_scores.csv", row.names = F)
# Stability scores - getting median and SD: https://stackoverflow.com/questions/13372734/how-to-display-the-median-value-in-a-faceted-boxplot-in-ggplot
trip_3_stability_numerical_summ <- ddply(Wilcoxon_trip_3_stability,
.(Comparison),
summarize,
med = median(Stability_scores_Trip_3),
SD = sd(Stability_scores_Trip_3))
# Now plotting:
ggplot(Wilcoxon_trip_3_stability,
aes(x = Comparison, y = Stability_scores_Trip_3)) +
geom_boxplot(fill = "red3", outlier.shape = NA) +
geom_jitter(size=0.8, alpha=0.5) +
geom_text(data = trip_3_stability_numerical_summ, aes(y = med, label = round(med,2)),size = 4.5, vjust = -0.5) + # adding median as text
geom_text(data = trip_3_stability_numerical_summ, aes(y = SD, label = round(SD,2)),size = 4.5, vjust = -0.5) + # adding median as text
ylim(0,1) +
labs(#x = "Microbial function",
y = "Re-occurrence of indicator features at PC 1 (4-fold CV, 50 reps)"
) +
theme_bw() +
stat_pvalue_manual(Wilcoxon_trip_3_stability %>%
pairwise_wilcox_test(Stability_scores_Trip_3 ~ Comparison) %>%
add_xy_position())
# Then I edited this manually in Inkscape
# Optimal parameters
keepX <- c(50, 50) # Keeping the same value across all trips!
trip4.final.spls2.WQ.taxa <- spls(X = OTUs_Trip4,
Y = metadata_Trip4[,24:40], # Choosing only medians!
ncomp = 2,
keepX = keepX,
mode = "regression")
trip4.final.spls2.WQ.GOs <- spls(X = GOs_Trip4,
Y = metadata_Trip4[,24:40], # Choosing only medians!
ncomp = 2,
keepX = keepX,
mode = "regression")
We can examine how frequently each variable is selected when we subsample the data using the perf() function to measure how stable the signature is.
stab.trip4.final.spls2.WQ.taxa <- perf(trip4.final.spls2.WQ.taxa,
validation = 'Mfold',
folds = 4,
nrepeat = 20)
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
stab.trip4.final.spls2.WQ.GOs <- perf(trip4.final.spls2.WQ.GOs,
validation = 'Mfold',
folds = 4,
nrepeat = 50)
# Extract stability - Dimension 1
stab.Taxa.trip4.comp1 <- stab.trip4.final.spls2.WQ.taxa$features$stability.X$comp1
stab.GOs.trip4.comp1 <- stab.trip4.final.spls2.WQ.GOs$features$stability.X$comp1
# We extract the stability measures of only the variables selected in spls2
extr.stab.Taxa.trip4.comp1 <- stab.Taxa.trip4.comp1[selectVar(trip4.final.spls2.WQ.taxa,
comp =1)$X$name]
extr.stab.GOs.trip4.comp1 <- stab.GOs.trip4.comp1[selectVar(trip4.final.spls2.WQ.GOs,
comp =1)$X$name]
# Plotting stability scores as boxplots, while simultaneously doing the Wilcoxon rank sum test - are these differences statistically significant?
# Preparing the object first:
Wilcoxon_trip_4_stability <- bind_rows(as.data.frame(extr.stab.Taxa.trip4.comp1) %>%
dplyr::rename(., Stability_scores_Trip_4 = extr.stab.Taxa.trip4.comp1) %>%
mutate(Comparison = "1_Taxa"),
as.data.frame(extr.stab.GOs.trip4.comp1) %>%
dplyr::rename(., Stability_scores_Trip_4 = extr.stab.GOs.trip4.comp1) %>%
mutate(Comparison = "2_Functions"))
# Exporting this as a numerical value as well:
write.csv(Wilcoxon_trip_4_stability, "/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Paper_drafts/the_analysis_is_finalised/Code/output_tables/trip_4_sPLS_stability_scores.csv", row.names = F)
# Stability scores - getting median and SD: https://stackoverflow.com/questions/13372734/how-to-display-the-median-value-in-a-faceted-boxplot-in-ggplot
trip_4_stability_numerical_summ <- ddply(Wilcoxon_trip_4_stability,
.(Comparison),
summarize,
med = median(Stability_scores_Trip_4),
SD = sd(Stability_scores_Trip_4))
# Now plotting:
ggplot(Wilcoxon_trip_4_stability,
aes(x = Comparison, y = Stability_scores_Trip_4)) +
geom_boxplot(fill = "slateblue", outlier.shape = NA) +
geom_jitter(size=0.8, alpha=0.5) +
geom_text(data = trip_4_stability_numerical_summ, aes(y = med, label = round(med,2)),size = 4.5, vjust = -0.5) + # adding median as text
geom_text(data = trip_4_stability_numerical_summ, aes(y = SD, label = round(SD,2)),size = 4.5, vjust = -0.5) + # adding median as text
ylim(0,1) +
labs(#x = "Microbial function",
y = "Re-occurrence of indicator features at PC 1 (4-fold CV, 50 reps)"
) +
theme_bw() +
stat_pvalue_manual(Wilcoxon_trip_4_stability %>%
pairwise_wilcox_test(Stability_scores_Trip_4 ~ Comparison) %>%
add_xy_position())
## Warning: Removed 59 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bracket()`).
## Warning: Removed 59 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_text()`).
## Removed 1 row containing missing values or values outside the scale range
## (`geom_text()`).
# Then I edited this manually in Inkscape